diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e926386..b7864db6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,6 +141,10 @@ jobs: agbenchmark start --improve --mock agbenchmark start --mock agbenchmark start --mock --category=retrieval + agbenchmark start --mock --category=interface + agbenchmark start --mock --category=code + agbenchmark start --mock --category=memory + agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start agbenchmark start --maintain diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index aeebd7ad..874fd45b 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -46,6 +46,13 @@ class Challenge(ABC): run_agent(self.task, config, self.CHALLENGE_LOCATION) + # hidden files are added after the agent runs. Hidden files can be python test files. + # We copy them in the workspace to make it easy to import the code produced by the agent + + copy_artifacts_into_workspace( + config["workspace"], "hidden_files", self.CHALLENGE_LOCATION + ) + def test_method(self, config: Dict[str, Any]) -> None: raise NotImplementedError diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 305cd28f..a890c9d3 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -46,3 +46,19 @@ Example: Current Output: - **score** (float): scores range from [0, 1] + +## Add files to challenges: + +### artifacts_in + +This folder contains all the files you want the agent to have in its workspace BEFORE the challenge starts. + +### artifacts_out +This folder contains all the files you would like the agent to generate. This folder is used to mock the agent. +This allows us to run `agbenchmark start --test=TestExample --mock` and make sure our challenge actually works. 
+ +### hidden_files +This folder contains files hidden from the agent but useful to assess whether a challenge is successful. +For example we can have a test.py in it, and this test.py will be added to the workspace at the end of a challenge. +This allows us to run this test.py and easily import code generated by the agent. +For example see: TestBasicCodeGeneration challenge. diff --git a/agbenchmark/challenges/code/d4/artifacts_out/__init__.py b/agbenchmark/challenges/code/d4/artifacts_out/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/agbenchmark/challenges/code/d4/artifacts_out/code.py b/agbenchmark/challenges/code/d4/artifacts_out/code.py new file mode 100644 index 00000000..de3d8c62 --- /dev/null +++ b/agbenchmark/challenges/code/d4/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json new file mode 100644 index 00000000..51f6f270 --- /dev/null +++ b/agbenchmark/challenges/code/d4/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestBasicCodeGeneration", + "category": ["code", "iterate"], + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "dependencies": ["TestWriteFile"], + "ground": { + "answer": "The two_sum function coded properly.", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "novice", + "description": "Tests ability for the agent to create the two_sum function.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d4/hidden_files/test.py b/agbenchmark/challenges/code/d4/hidden_files/test.py new file mode 100644 index 00000000..d85d1353 --- /dev/null +++ b/agbenchmark/challenges/code/d4/hidden_files/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 521d626c..bca191cd 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 521d626c0075ed6545f01b771757c856f8addbd6 +Subproject commit bca191cd76cdea0335da91d004c64d9bb8520fea diff --git a/pyproject.toml b/pyproject.toml index b0526ab5..48be9cf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-38,10 +38,10 @@ testpaths = [ ] markers = [ "retrieval", - "regression", "interface", "code", - "memory" + "memory", + "iterate" ] [tool.poetry.scripts]