fix(benchmark): Mock mode, python evals, --attempts flag, challenge definitions

- Fixed `--mock` mode
   - Moved the interrupt to the beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code still runs after the single mocked step is executed.
   - Implemented mock mode in `WebArenaChallenge`

- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted

- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`

- Disabled left-over Helicone code (see 056163e)

- Fixed a couple of challenge definitions
   - WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
   - synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list

- Added some debug logging in agent_api_interface.py and challenges/builtin.py
Author: Reinier van der Leer
Date:   2024-02-14 01:05:34 +01:00
parent bb7f5abc6c
commit 327fb1f916
6 changed files with 63 additions and 44 deletions
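
Side note on the `--attempts`/`-N` fix: pytest only treats test-function parameters *without* a default value as fixture requests, so giving `i_attempt` a default of `0` (see the `BuiltinChallenge` and `WebArenaChallenge` diffs below) stops pytest from looking for a nonexistent `i_attempt` fixture when the flag is omitted. A minimal standalone illustration of that pytest behavior (not code from this repo):

```python
# Standalone pytest illustration; save as e.g. test_demo.py and run `pytest`.

def test_without_default(i_attempt):
    # Errors at setup with "fixture 'i_attempt' not found": parameters without
    # a default are resolved as fixtures, and no such fixture exists.
    assert i_attempt == 0


def test_with_default(i_attempt: int = 0):
    # Runs fine: parameters with a default are not treated as fixture requests,
    # so the default value is simply used.
    assert i_attempt == 0
```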

View File

@@ -23,6 +23,8 @@ async def run_api_agent(
     config: AgentBenchmarkConfig,
     timeout: int,
     artifacts_location: Optional[Path] = None,
+    *,
+    mock: bool = False,
 ) -> AsyncIterator[Step]:
     configuration = Configuration(host=config.host)
     async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
         task_id = response.task_id

         if artifacts_location:
+            logger.debug("Uploading task input artifacts to agent...")
             await upload_artifacts(
                 api_instance, artifacts_location, task_id, "artifacts_in"
             )

+        logger.debug("Running agent until finished or timeout...")
         while True:
             step = await api_instance.execute_agent_task_step(task_id=task_id)
             yield step

             if time.time() - start_time > timeout:
                 raise TimeoutError("Time limit exceeded")
+            if step and mock:
+                step.is_last = True
             if not step or step.is_last:
                 break

         if artifacts_location:
             # In "mock" mode, we cheat by giving the correct artifacts to pass the test
-            if os.getenv("IS_MOCK"):
+            if mock:
+                logger.debug("Uploading mock artifacts to agent...")
                 await upload_artifacts(
                     api_instance, artifacts_location, task_id, "artifacts_out"
                 )

+            logger.debug("Downloading agent artifacts...")
             await download_agent_artifacts_into_folder(
                 api_instance, task_id, config.temp_folder
             )

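A hedged usage sketch of the updated `run_api_agent()` above; the import paths and the `task` parameter name are assumptions, not something this diff shows:

```python
from agbenchmark.agent_api_interface import run_api_agent  # assumed import path
from agbenchmark.config import AgentBenchmarkConfig  # assumed import path


async def run_single_mock_step(config: AgentBenchmarkConfig) -> None:
    """Drive the agent in mock mode; the flag replaces the old IS_MOCK env check."""
    steps = []
    async for step in run_api_agent(
        task="Write 'Hello World' to a file called output.txt",  # assumed param name
        config=config,
        timeout=60,
        artifacts_location=None,  # no input artifacts in this sketch
        mock=True,  # marks the first step as is_last, so the loop stops after one step
    ):
        steps.append(step)
    assert len(steps) == 1  # mock mode yields exactly one step
```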
View File

@@ -60,7 +60,7 @@ class BaseChallenge(ABC):

     @classmethod
     async def run_challenge(
-        cls, config: AgentBenchmarkConfig, timeout: int
+        cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
     ) -> AsyncIterator[Step]:
         """
         Runs the challenge on the subject agent with the specified timeout.
@@ -89,7 +89,7 @@ class BaseChallenge(ABC):
         logger.debug(f"Starting {cls.info.name} challenge run")
         i = 0
         async for step in run_api_agent(
-            cls.info.task, config, timeout, cls.info.task_artifacts_dir
+            cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
         ):
             i += 1
             print(f"[{cls.info.name}] - step {step.name} ({i}. request)")

View File

@@ -158,12 +158,12 @@ class BuiltinChallenge(BaseChallenge):
         self,
         config: AgentBenchmarkConfig,
         request: pytest.FixtureRequest,
-        i_attempt: int,
+        i_attempt: int = 0,
     ) -> None:
-        if os.environ.get("HELICONE_API_KEY"):
-            from helicone.lock import HeliconeLockManager
+        # if os.environ.get("HELICONE_API_KEY"):
+        #     from helicone.lock import HeliconeLockManager

-            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+        #     HeliconeLockManager.write_custom_property("challenge", self.info.name)

         timeout = self._spec.cutoff or 60

@@ -175,12 +175,11 @@ class BuiltinChallenge(BaseChallenge):
         task_id = ""
         timed_out = None
         try:
-            async for step in self.run_challenge(config, timeout):
+            async for step in self.run_challenge(
+                config, timeout, mock=request.config.getoption("--mock")
+            ):
                 if not task_id:
                     task_id = step.task_id
-                if request.config.getoption("--mock"):
-                    # Run only one step in mock mode
-                    break
             timed_out = False
         except TimeoutError:
             timed_out = True
@@ -230,15 +229,6 @@ class BuiltinChallenge(BaseChallenge):

     @classmethod
     def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
-        if cls._spec.task == "" and os.getenv("IS_MOCK"):
-            yield EvalResult(
-                result="This is a mock answer",
-                result_source="step_output",
-                score=1.0,
-                passed=True,
-            )
-            return
-
         result_ground = cls._spec.ground
         outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)

@@ -254,6 +244,15 @@ class BuiltinChallenge(BaseChallenge):
                         passed=score > 0.9,  # FIXME: arbitrary threshold
                     )

+        if result_ground.eval.type in ("python", "pytest"):
+            for py_file, output in outputs_for_eval:
+                yield EvalResult(
+                    result=output,
+                    result_source=str(py_file),
+                    score=float(not output.startswith("Error:")),
+                    passed=not output.startswith("Error:"),
+                )
+
         if result_ground.eval.type == "llm":
             combined_results = "\n".join(output[1] for output in outputs_for_eval)
             llm_eval = cls.score_result_with_llm(combined_results, result_ground)
@@ -290,7 +289,16 @@ class BuiltinChallenge(BaseChallenge):
                 # Otherwise, it is a specific file
                 matching_files = [os.path.join(script_dir, file_pattern)]
+            logger.debug(
+                f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
+            )

             for file_path in matching_files:
+                relative_file_path = Path(file_path).relative_to(workspace)
+                logger.debug(
+                    f"Evaluating {relative_file_path} "
+                    f"(eval type: {ground.eval.type})..."
+                )
                 if ground.eval.type == "python":
                     result = subprocess.run(
                         [sys.executable, file_path],
@@ -299,15 +307,12 @@ class BuiltinChallenge(BaseChallenge):
                         text=True,
                     )
                     if "error" in result.stderr or result.returncode != 0:
-                        print(result.stderr)
-                        assert False, result.stderr
-                    yield (
-                        Path(file_path).relative_to(workspace),
-                        f"Output: {result.stdout}\n",
-                    )
+                        yield relative_file_path, f"Error: {result.stderr}\n"
+                    else:
+                        yield relative_file_path, f"Output: {result.stdout}\n"
                 else:
                     with open(file_path, "r") as f:
-                        yield Path(file_path).relative_to(workspace), f.read()
+                        yield relative_file_path, f.read()
         else:
             if ground.eval.type == "pytest":
                 result = subprocess.run(
@@ -317,9 +322,9 @@ class BuiltinChallenge(BaseChallenge):
                     text=True,
                 )
                 if "error" in result.stderr or result.returncode != 0:
-                    print(result.stderr)
-                    assert False, result.stderr
-                yield "pytest", f"Output: {result.stdout}\n"
+                    yield "pytest", f"Error: {result.stderr}\n"
+                else:
+                    yield "pytest", f"Output: {result.stdout}\n"

     @staticmethod
     def score_result(content: str, ground: BuiltinChallengeSpec.Ground) -> float | None:
@@ -358,9 +363,9 @@ class BuiltinChallenge(BaseChallenge):

     @classmethod
     def score_result_with_llm(
-        cls, content: str, ground: BuiltinChallengeSpec.Ground
+        cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
     ) -> float:
-        if os.getenv("IS_MOCK"):
+        if mock:
             return 1.0

         # the validation for this is done in the Eval BaseModel

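To make the new `python`/`pytest` eval handling concrete: `get_outputs_for_eval` now yields an `Error: ...` string instead of asserting, and `evaluate_workspace_content` maps that prefix to score 0.0 / failed. A tiny illustration with made-up outputs:

```python
from pathlib import Path

# Made-up (file, output) pairs in the shape yielded by get_outputs_for_eval()
outputs_for_eval = [
    (Path("test_sample.py"), "Output: 3 passed in 0.04s\n"),
    (Path("broken.py"), "Error: Traceback (most recent call last): ...\n"),
]

for py_file, output in outputs_for_eval:
    failed = output.startswith("Error:")
    # Mirrors the new EvalResult fields: score 0.0 and passed=False on error
    print(f"{py_file}: score={float(not failed)}, passed={not failed}")
```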
View File

@@ -18,9 +18,7 @@
     "files": [
       "output.txt"
     ],
-    "should_contain": [
-      ""
-    ],
+    "should_contain": [],
     "should_not_contain": []
   },
   "info": {

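Why the empty string had to go from `should_contain`: `"" in s` is true for every string, so that entry could never fail the containment check. For example:

```python
# "" is a substring of every string, so an empty should_contain entry
# passes no matter what the agent actually wrote to output.txt.
output = "text that has nothing to do with the expected content"
print("" in output)  # True
print("" in "")      # True, even for empty output
```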
View File

@@ -317,7 +317,11 @@ class WebArenaChallenge(BaseChallenge):
         return results

     @classmethod
-    def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
+    def evaluate_step_result(
+        cls, step: Step, *, mock: bool = False
+    ) -> list[tuple[_Eval, EvalResult]]:
+        if mock:
+            step.output = cls.info.reference_answer
         assert step.output
         eval_results = cls.evaluate_answer(step.output)
         for eval in cls._spec.eval.evaluators:
@@ -356,12 +360,12 @@ class WebArenaChallenge(BaseChallenge):
         self,
         config: AgentBenchmarkConfig,
         request: pytest.FixtureRequest,
-        i_attempt: int,
+        i_attempt: int = 0,
     ) -> None:
-        if os.environ.get("HELICONE_API_KEY"):
-            from helicone.lock import HeliconeLockManager
+        # if os.environ.get("HELICONE_API_KEY"):
+        #     from helicone.lock import HeliconeLockManager

-            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+        #     HeliconeLockManager.write_custom_property("challenge", self.info.name)

         timeout = 120
         if request.config.getoption("--nc"):
@@ -372,11 +376,15 @@ class WebArenaChallenge(BaseChallenge):
         timed_out = None
         eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
         try:
-            async for step in self.run_challenge(config, timeout):
+            async for step in self.run_challenge(
+                config, timeout, mock=request.config.getoption("--mock")
+            ):
                 if not step.output:
                     logger.warn(f"Step has no output: {step}")
                     continue
-                step_eval_results = self.evaluate_step_result(step)
+                step_eval_results = self.evaluate_step_result(
+                    step, mock=request.config.getoption("--mock")
+                )
                 logger.debug(f"Intermediary results: {step_eval_results}")
                 eval_results_per_step.append(step_eval_results)
                 if step.is_last:

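The WebArena mock path above substitutes the challenge's reference answer for the step output before evaluation. A minimal stand-in to show the effect (`FakeStep` is not the real `agent_protocol_client` model, and the answer string is a placeholder):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeStep:
    # Reduced stand-in for Step, keeping only the field the mock branch touches
    output: Optional[str] = None


REFERENCE_ANSWER = "May: 8 orders; June: 13 orders; ..."  # placeholder value

mock = True  # as set via pytest's --mock option
step = FakeStep()  # a mocked agent produces no meaningful output
if mock:
    step.output = REFERENCE_ANSWER  # what evaluate_step_result(..., mock=True) does
assert step.output  # evaluation now runs against a known-good answer
```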
View File

@@ -334,7 +334,7 @@
         "June: 13 orders",
         "July: 9 orders",
         "August: 8 orders",
-        "Sepetember: 10 orders",
+        "September: 10 orders",
         "October: 4 orders",
         "November: 5 orders",
         "December: 10 orders"
@@ -343,7 +343,7 @@
       "reference_url": "",
       "program_html": [],
       "string_note": "",
-      "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
+      "reference_answer_raw_annotation": "May: 8 orders; June: 13 orders; July: 9 orders; August: 8 orders; September: 10 orders; October: 4 orders; November: 5 orders; December: 10 orders"
     },
     "intent_template_id": 270,
     "string_note": null,