fix(benchmark): Mock mode, python evals, --attempts flag, challenge definitions

- Fixed `--mock` mode
  - Moved interrupt to beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code is properly executed after executing a single step.
  - Implemented mock mode in `WebArenaChallenge`
- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted
- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`
- Disabled left-over Helicone code (see 056163e)
- Fixed a couple of challenge definitions
  - WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
  - synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list
- Added some debug logging in agent_api_interface.py and challenges/builtin.py
2025-12-17 22:14:28 +01:00 · 2024-02-14 01:05:34 +01:00
parent bb7f5abc6c
commit 327fb1f916
6 changed files with 63 additions and 44 deletions
--- a/benchmark/agbenchmark/agent_api_interface.py
+++ b/benchmark/agbenchmark/agent_api_interface.py
@@ -23,6 +23,8 @@ async def run_api_agent(
    config: AgentBenchmarkConfig,
    timeout: int,
    artifacts_location: Optional[Path] = None,
+    *,
+    mock: bool = False,
 ) -> AsyncIterator[Step]:
    configuration = Configuration(host=config.host)
    async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
        task_id = response.task_id

        if artifacts_location:
+            logger.debug("Uploading task input artifacts to agent...")
            await upload_artifacts(
                api_instance, artifacts_location, task_id, "artifacts_in"
            )

+        logger.debug("Running agent until finished or timeout...")
        while True:
            step = await api_instance.execute_agent_task_step(task_id=task_id)
            yield step

            if time.time() - start_time > timeout:
                raise TimeoutError("Time limit exceeded")
+            if step and mock:
+                step.is_last = True
            if not step or step.is_last:
                break

        if artifacts_location:
            # In "mock" mode, we cheat by giving the correct artifacts to pass the test
-            if os.getenv("IS_MOCK"):
+            if mock:
+                logger.debug("Uploading mock artifacts to agent...")
                await upload_artifacts(
                    api_instance, artifacts_location, task_id, "artifacts_out"
                )

+            logger.debug("Downloading agent artifacts...")
            await download_agent_artifacts_into_folder(
                api_instance, task_id, config.temp_folder
            )