fix(benchmark): Mock mode, python evals, --attempts flag, challenge definitions

- Fixed `--mock` mode
   - Moved the mock-mode interrupt to the beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code (e.g. artifact upload/download) still runs after the single mocked step.
   - Implemented mock mode in `WebArenaChallenge`

- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted (see the toy example below this list)

- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`

- Disabled left-over Helicone code (see 056163e)

- Fixed a couple of challenge definitions
   - WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
   - synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list

- Added some debug logging in agent_api_interface.py and challenges/builtin.py
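
For context on the `i_attempt` fix: pytest only treats parameters *without* default values as fixture requests, so when no `--attempts`/`-N` parametrization supplies `i_attempt`, the un-defaulted parameter has nothing to satisfy it; giving it a default of 0 covers that case. A toy illustration of just that mechanism (not the benchmark's actual test code):

```python
# toy_test.py -- illustration only, not agbenchmark code

def test_attempt_without_default(i_attempt: int) -> None:
    # With no parametrization and no fixture named `i_attempt`, pytest errors
    # at setup with: "fixture 'i_attempt' not found"
    assert i_attempt >= 0

def test_attempt_with_default(i_attempt: int = 0) -> None:
    # A parameter with a default is not a fixture request, so this runs and
    # simply uses attempt #0 whenever the attempts parametrization is off.
    assert i_attempt == 0
```
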
Reinier van der Leer
2024-02-14 01:05:34 +01:00
parent bb7f5abc6c
commit 327fb1f916
6 changed files with 63 additions and 44 deletions

View File: agent_api_interface.py

@@ -23,6 +23,8 @@ async def run_api_agent(
config: AgentBenchmarkConfig,
timeout: int,
artifacts_location: Optional[Path] = None,
*,
mock: bool = False,
) -> AsyncIterator[Step]:
configuration = Configuration(host=config.host)
async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
task_id = response.task_id
if artifacts_location:
logger.debug("Uploading task input artifacts to agent...")
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_in"
)
logger.debug("Running agent until finished or timeout...")
while True:
step = await api_instance.execute_agent_task_step(task_id=task_id)
yield step
if time.time() - start_time > timeout:
raise TimeoutError("Time limit exceeded")
if step and mock:
step.is_last = True
if not step or step.is_last:
break
if artifacts_location:
# In "mock" mode, we cheat by giving the correct artifacts to pass the test
if os.getenv("IS_MOCK"):
if mock:
logger.debug("Uploading mock artifacts to agent...")
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_out"
)
logger.debug("Downloading agent artifacts...")
await download_agent_artifacts_into_folder(
api_instance, task_id, config.temp_folder
)
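
Not part of the diff: a minimal sketch of how a caller might drive the updated `run_api_agent`, based on the signature above. The task string, artifacts path, and import path are assumptions, not taken from the repository:

```python
from pathlib import Path

from agbenchmark.agent_api_interface import run_api_agent  # assumed module path

async def run_one_mock_step(config, task_dir: Path) -> None:
    # `config` is an AgentBenchmarkConfig; all values here are placeholders.
    async for step in run_api_agent(
        "Write 'Hello World' to output.txt",  # the task (first positional arg)
        config,
        timeout=60,
        artifacts_location=task_dir,
        mock=True,  # keyword-only; replaces the old IS_MOCK env-var check
    ):
        print(step.name, step.is_last)
    # With mock=True the first step is forced to be the last, so the loop ends
    # after one iteration and the artifact upload/download code still runs.
```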

View File

@@ -60,7 +60,7 @@ class BaseChallenge(ABC):
@classmethod
async def run_challenge(
cls, config: AgentBenchmarkConfig, timeout: int
cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
) -> AsyncIterator[Step]:
"""
Runs the challenge on the subject agent with the specified timeout.
@@ -89,7 +89,7 @@ class BaseChallenge(ABC):
logger.debug(f"Starting {cls.info.name} challenge run")
i = 0
async for step in run_api_agent(
cls.info.task, config, timeout, cls.info.task_artifacts_dir
cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
):
i += 1
print(f"[{cls.info.name}] - step {step.name} ({i}. request)")

View File: challenges/builtin.py

@@ -158,12 +158,12 @@ class BuiltinChallenge(BaseChallenge):
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
i_attempt: int = 0,
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
# if os.environ.get("HELICONE_API_KEY"):
# from helicone.lock import HeliconeLockManager
HeliconeLockManager.write_custom_property("challenge", self.info.name)
# HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = self._spec.cutoff or 60
@@ -175,12 +175,11 @@ class BuiltinChallenge(BaseChallenge):
task_id = ""
timed_out = None
try:
async for step in self.run_challenge(config, timeout):
async for step in self.run_challenge(
config, timeout, mock=request.config.getoption("--mock")
):
if not task_id:
task_id = step.task_id
if request.config.getoption("--mock"):
# Run only one step in mock mode
break
timed_out = False
except TimeoutError:
timed_out = True
@@ -230,15 +229,6 @@ class BuiltinChallenge(BaseChallenge):
@classmethod
def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
if cls._spec.task == "" and os.getenv("IS_MOCK"):
yield EvalResult(
result="This is a mock answer",
result_source="step_output",
score=1.0,
passed=True,
)
return
result_ground = cls._spec.ground
outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)
@@ -254,6 +244,15 @@ class BuiltinChallenge(BaseChallenge):
passed=score > 0.9, # FIXME: arbitrary threshold
)
if result_ground.eval.type in ("python", "pytest"):
for py_file, output in outputs_for_eval:
yield EvalResult(
result=output,
result_source=str(py_file),
score=float(not output.startswith("Error:")),
passed=not output.startswith("Error:"),
)
if result_ground.eval.type == "llm":
combined_results = "\n".join(output[1] for output in outputs_for_eval)
llm_eval = cls.score_result_with_llm(combined_results, result_ground)
@@ -290,7 +289,16 @@ class BuiltinChallenge(BaseChallenge):
# Otherwise, it is a specific file
matching_files = [os.path.join(script_dir, file_pattern)]
logger.debug(
f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
)
for file_path in matching_files:
relative_file_path = Path(file_path).relative_to(workspace)
logger.debug(
f"Evaluating {relative_file_path} "
f"(eval type: {ground.eval.type})..."
)
if ground.eval.type == "python":
result = subprocess.run(
[sys.executable, file_path],
@@ -299,15 +307,12 @@ class BuiltinChallenge(BaseChallenge):
text=True,
)
if "error" in result.stderr or result.returncode != 0:
print(result.stderr)
assert False, result.stderr
yield (
Path(file_path).relative_to(workspace),
f"Output: {result.stdout}\n",
)
yield relative_file_path, f"Error: {result.stderr}\n"
else:
yield relative_file_path, f"Output: {result.stdout}\n"
else:
with open(file_path, "r") as f:
yield Path(file_path).relative_to(workspace), f.read()
yield relative_file_path, f.read()
else:
if ground.eval.type == "pytest":
result = subprocess.run(
@@ -317,8 +322,8 @@ class BuiltinChallenge(BaseChallenge):
text=True,
)
if "error" in result.stderr or result.returncode != 0:
print(result.stderr)
assert False, result.stderr
yield "pytest", f"Error: {result.stderr}\n"
else:
yield "pytest", f"Output: {result.stdout}\n"
@staticmethod
@@ -358,9 +363,9 @@ class BuiltinChallenge(BaseChallenge):
@classmethod
def score_result_with_llm(
cls, content: str, ground: BuiltinChallengeSpec.Ground
cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
) -> float:
if os.getenv("IS_MOCK"):
if mock:
return 1.0
# the validation for this is done in the Eval BaseModel
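
Summarizing the `python`/`pytest` eval change in the hunks above: a failing script no longer aborts the run via `assert False`; it is yielded as an `Error: ...` output and turned into a zero-score `EvalResult`. A condensed sketch of that flow with simplified names (not the actual `BuiltinChallenge` methods):

```python
# Condensed sketch of the post-commit behaviour; simplified, not real agbenchmark code.
import subprocess
import sys
from pathlib import Path

def run_python_eval(file_path: Path) -> str:
    result = subprocess.run(
        [sys.executable, str(file_path)], capture_output=True, text=True
    )
    if "error" in result.stderr or result.returncode != 0:
        # previously: `assert False, result.stderr`, which killed the whole eval run
        return f"Error: {result.stderr}\n"
    return f"Output: {result.stdout}\n"

def score_output(output: str) -> tuple[float, bool]:
    # mirrors the new python/pytest branch in evaluate_workspace_content
    failed = output.startswith("Error:")
    return (0.0 if failed else 1.0), (not failed)
```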

View File: synthesize/1_basic_content_gen challenge definition

@@ -18,9 +18,7 @@
"files": [
"output.txt"
],
"should_contain": [
""
],
"should_contain": [],
"should_not_contain": []
},
"info": {

View File

@@ -317,7 +317,11 @@ class WebArenaChallenge(BaseChallenge):
return results
@classmethod
def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
def evaluate_step_result(
cls, step: Step, *, mock: bool = False
) -> list[tuple[_Eval, EvalResult]]:
if mock:
step.output = cls.info.reference_answer
assert step.output
eval_results = cls.evaluate_answer(step.output)
for eval in cls._spec.eval.evaluators:
@@ -356,12 +360,12 @@ class WebArenaChallenge(BaseChallenge):
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
i_attempt: int = 0,
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
# if os.environ.get("HELICONE_API_KEY"):
# from helicone.lock import HeliconeLockManager
HeliconeLockManager.write_custom_property("challenge", self.info.name)
# HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = 120
if request.config.getoption("--nc"):
@@ -372,11 +376,15 @@ class WebArenaChallenge(BaseChallenge):
timed_out = None
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
try:
async for step in self.run_challenge(config, timeout):
async for step in self.run_challenge(
config, timeout, mock=request.config.getoption("--mock")
):
if not step.output:
logger.warn(f"Step has no output: {step}")
continue
step_eval_results = self.evaluate_step_result(step)
step_eval_results = self.evaluate_step_result(
step, mock=request.config.getoption("--mock")
)
logger.debug(f"Intermediary results: {step_eval_results}")
eval_results_per_step.append(step_eval_results)
if step.is_last:
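
A standalone sketch of the mock behaviour added to `WebArenaChallenge.evaluate_step_result` above: in `--mock` mode the step's output is overwritten with the challenge's reference answer before the evaluators see it. Simplified stand-in types, not the real agent-protocol models:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeStep:  # stand-in for the agent protocol's Step model
    output: Optional[str] = None
    is_last: bool = False

def evaluate_step_result_sketch(
    step: FakeStep, reference_answer: str, *, mock: bool = False
) -> str:
    if mock:
        # cheat: pretend the agent produced the reference answer
        step.output = reference_answer
    assert step.output
    return step.output  # the real method feeds this into evaluate_answer()

# e.g. with an excerpt of the task-107 reference data:
print(evaluate_step_result_sketch(FakeStep(), "October: 4 orders", mock=True))
```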

View File: WebArena task 107 challenge data

@@ -334,7 +334,7 @@
"June: 13 orders",
"July: 9 orders",
"August: 8 orders",
"Sepetember: 10 orders",
"September: 10 orders",
"October: 4 orders",
"November: 5 orders",
"December: 10 orders"
@@ -343,7 +343,7 @@
"reference_url": "",
"program_html": [],
"string_note": "",
"reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
"reference_answer_raw_annotation": "May: 8 orders; June: 13 orders; July: 9 orders; August: 8 orders; September: 10 orders; October: 4 orders; November: 5 orders; December: 10 orders"
},
"intent_template_id": 270,
"string_note": null,