Ability to run by categories (#5229)

* Ability to run by categories Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com> * always use Path.cwd() Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com> --------- Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
2025-12-17 22:14:28 +01:00 · 2023-09-15 20:04:12 -07:00
parent 688cd52be2
commit 295702867a
24 changed files with 200 additions and 476 deletions
--- a/benchmark/agbenchmark/generate_test.py
+++ b/benchmark/agbenchmark/generate_test.py
@@ -10,8 +10,11 @@ from typing import Any, Dict, Optional

 import pytest

+from agbenchmark.__main__ import CHALLENGES_ALREADY_BEATEN, UPDATES_JSON_PATH
+from agbenchmark.agent_api_interface import append_updates_file
 from agbenchmark.utils.challenge import Challenge
 from agbenchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData
+from agent_protocol_client.models.step import Step

 DATA_CATEGORY = {}

@@ -48,7 +51,7 @@ def create_single_test(
        test_name = self.data.name

        try:
-            with open("challenges_already_beaten.json", "r") as f:
+            with open(CHALLENGES_ALREADY_BEATEN, "r") as f:
                challenges_beaten_in_the_past = json.load(f)
        except:
            challenges_beaten_in_the_past = {}
@@ -82,7 +85,24 @@ def create_single_test(
        )
        del scores["answers"]  # remove answers from scores
        request.node.scores = scores  # store scores in request.node
-        assert 1 in scores["values"]
+        is_score_100 = 1 in scores["values"]
+
+        evaluation = "Correct!" if is_score_100 else "Incorrect."
+        eval_step = Step(
+            input=evaluation,
+            additional_input=None,
+            task_id="irrelevant, this step is a hack",
+            step_id="irrelevant, this step is a hack",
+            name="",
+            status="created",
+            output=None,
+            additional_output=None,
+            artifacts=[],
+            is_last=True,
+        )
+        await append_updates_file(eval_step)
+
+        assert is_score_100

    # Parametrize the method here
    test_method = pytest.mark.parametrize(
@@ -194,4 +214,18 @@ def challenge_should_be_ignored(json_file):
    return "challenges/deprecated" in json_file or "challenges/library" in json_file


+def initialize_updates_file():
+    """Reset the updates file at UPDATES_JSON_PATH to an empty JSON array."""
+    # Mode "w" creates a missing file and truncates an existing one, so the
+    # write is shared; only the printed message depends on prior existence.
+    already_existed = os.path.exists(UPDATES_JSON_PATH)
+    with open(UPDATES_JSON_PATH, "w") as file:
+        json.dump([], file, indent=2)
+    if already_existed:
+        print("Initialized updates.json by overwriting with an empty array")
+    else:
+        print("Created updates.json and initialized it with an empty array")
+
+
+initialize_updates_file()
 generate_tests()