Replace hidden files with custom python (#99)

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
This commit is contained in:
merwanehamadi
2023-07-14 14:39:47 -07:00
committed by GitHub
parent a9702e4629
commit 7bc7d9213d
6 changed files with 23 additions and 28 deletions

View File

@@ -50,7 +50,7 @@ class Challenge(ABC):
# We copy them in the workspace to make it easy to import the code produced by the agent
copy_artifacts_into_workspace(
config["workspace"], "hidden_files", self.CHALLENGE_LOCATION
config["workspace"], "custom_python", self.CHALLENGE_LOCATION
)
def test_method(self, config: Dict[str, Any]) -> None:

View File

@@ -57,8 +57,7 @@ This folder contains all the files you want the agent to have in its workspace B
This folder contains all the files you would like the agent to generate. This folder is used to mock the agent.
This allows us to run `agbenchmark start --test=TestExample --mock` and make sure our challenge actually works.
### hidden_files
This folder contains files hidden from the agent but useful to assess whether a challenge is successful.
For example we can have a test.py in it, and this test.py will be added to the workspace at the end of a challenge.
This allows us to run this test.py and easily import code generated by the agent.
For example see: TestBasicCodeGeneration challenge.
### custom_python
This folder contains files that will be copied into the agent's workspace and run after the challenge is completed.
For example, we can have a test.py in it and run this file in the workspace to easily import code generated by the agent.
Example: TestBasicCodeGeneration challenge.

View File

@@ -5,6 +5,15 @@ from unittest.mock import Mock, patch
import requests
def test_make_request_and_assert() -> None:
result = make_request_and_assert()
print(result)
expected_result = {"status": "OK"}
error_message = f"AssertionError: Expected the output to be {expected_result}"
print(error_message)
assert result == expected_result, error_message
def make_assertion() -> None:
if os.environ.get("MOCK_TEST", "False").lower() == "true":
mock_response = Mock(requests.Response)
@@ -25,3 +34,8 @@ def make_request_and_assert() -> Dict[str, Any]:
)
return response.json()
if __name__ == "__main__":
# test for the case when server is healthy
test_make_request_and_assert()

View File

@@ -6,9 +6,9 @@
"ground": {
"answer": "GET localhost:8079/health responds with a 200 OK",
"should_contain": [],
"should_not_contain": [],
"files": [],
"type": "custom_python"
"should_not_contain": ["AssertionError"],
"files": ["test.py"],
"type": "execute_python_code"
},
"info": {
"difficulty": "advanced",

View File

@@ -2,8 +2,6 @@ import glob
import importlib
import json
import os
import pkgutil
import sys
import types
from pathlib import Path
from typing import Any, Dict
@@ -60,23 +58,7 @@ def generate_tests() -> None:
scores = self.get_scores(config)
# Check if make_assertion is defined and use it
if self.data.ground.type == "custom_python":
custom_python_location = (
f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python"
)
sys.path.append(str(custom_python_location))
for module_loader, name, ispkg in pkgutil.iter_modules(
[str(custom_python_location)]
):
module = importlib.import_module(name)
if hasattr(module, "make_assertion"):
make_assertion = getattr(module, "make_assertion")
make_assertion()
else:
assert 1 in scores
assert 1 in scores
# Parametrize the method here
test_method = pytest.mark.parametrize(