diff --git a/bench_engines.py b/bench_engines.py index 2efdc2a7f..d818de6ce 100644 --- a/bench_engines.py +++ b/bench_engines.py @@ -38,6 +38,8 @@ def get_engine(model_class: str, model_id: str, context_size: int = None): return OpenAIEngine(model="gpt-4o-2024-05-13", temperature=0, max_context_size=context_size) if model_id == "gpt-3.5-turbo-0125": return OpenAIEngine(model="gpt-3.5-turbo-0125", temperature=0, max_context_size=context_size) + if model_id == "gpt-4o-mini": + return OpenAIEngine(model="gpt-4o-mini", temperature=0, max_context_size=context_size) # ==== MISTRAL ==== if model_class == "mistral": from kani.ext.vllm import VLLMEngine @@ -166,7 +168,6 @@ def get_engine(model_class: str, model_id: str, context_size: int = None): }, sampling_params=SamplingParams(temperature=0.7, max_tokens=2048, min_tokens=1), ) - # todo: cohere raise ValueError("unknown engine") diff --git a/bench_webarena.py b/bench_webarena.py index aa9c313b7..8b99fb2ed 100644 --- a/bench_webarena.py +++ b/bench_webarena.py @@ -81,14 +81,17 @@ def wa_ensure_auth(config_file: Path) -> Path: comb = get_site_comb_from_filepath(cookie_file_name) temp_dir = tempfile.mkdtemp() # subprocess to renew the cookie - subprocess.run([ - "python", - "experiments/webarena/auto_login.py", - "--auth_folder", - temp_dir, - "--site_list", - *comb, - ]) + subprocess.run( + [ + "python", + "experiments/webarena/auto_login.py", + "--auth_folder", + temp_dir, + "--site_list", + *comb, + ], + check=True, + ) _c["storage_state"] = f"{temp_dir}/{cookie_file_name}" assert os.path.exists(_c["storage_state"]) # write a temp copy of the config file diff --git a/redel/tools/webarena/patches.py b/redel/tools/webarena/patches.py index 5661c4555..8022cb453 100644 --- a/redel/tools/webarena/patches.py +++ b/redel/tools/webarena/patches.py @@ -139,15 +139,15 @@ def patch_to_support_webarena(): # WebArena runs a subprocess to login to get cookies # which spews logs / warnings, so we silence them - _subprocess_run = subprocess.run - - def subprocess_run(*args, **kwargs): - if any("auto_login.py" in a for a in args[0]): - kwargs["stdout"] = subprocess.PIPE - kwargs["stderr"] = subprocess.PIPE - return _subprocess_run(*args, **kwargs) - - subprocess.run = lambda *args, **kwargs: subprocess_run(*args, **kwargs) + # _subprocess_run = subprocess.run + # + # def subprocess_run(*args, **kwargs): + # if any("auto_login.py" in a for a in args[0]): + # kwargs["stdout"] = subprocess.PIPE + # kwargs["stderr"] = subprocess.PIPE + # return _subprocess_run(*args, **kwargs) + # + # subprocess.run = lambda *args, **kwargs: subprocess_run(*args, **kwargs) # WebArena's get_bounding_client_rect method is very slow with ignore_webarena_warnings(): diff --git a/test-webarena.sh b/test-webarena.sh new file mode 100644 index 000000000..96c7fca27 --- /dev/null +++ b/test-webarena.sh @@ -0,0 +1,4 @@ +#!/bin/zsh + +source slurm/webarena-env.sh +python bench_webarena.py --config baseline --model-class openai --large-model gpt-4o-mini --small-model gpt-4o-mini --save-dir experiments/webarena/dev/baseline