Mirror of https://github.com/aljazceru/Auto-GPT.git, synced 2026-01-05 07:14:27 +01:00
Add basic code generation challenge (#98)
.github/workflows/ci.yml (vendored, 4 changes)
@@ -141,6 +141,10 @@ jobs:
    agbenchmark start --improve --mock
    agbenchmark start --mock
    agbenchmark start --mock --category=retrieval
    agbenchmark start --mock --category=interface
    agbenchmark start --mock --category=code
    agbenchmark start --mock --category=memory
    agbenchmark start --mock --category=iterate
else
    curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start
    agbenchmark start --maintain
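For context, the step above runs one agbenchmark invocation per category in mock mode. A rough sketch of driving the same matrix from Python (the category names come from the diff; the subprocess wrapper is illustrative and assumes agbenchmark is installed on the PATH):

```python
import subprocess

# Categories exercised by the mock branch of the CI step above.
CATEGORIES = ["retrieval", "interface", "code", "memory", "iterate"]

subprocess.run(["agbenchmark", "start", "--improve", "--mock"], check=True)
subprocess.run(["agbenchmark", "start", "--mock"], check=True)
for category in CATEGORIES:
    subprocess.run(
        ["agbenchmark", "start", "--mock", f"--category={category}"], check=True
    )
```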
@@ -46,6 +46,13 @@ class Challenge(ABC):
        run_agent(self.task, config, self.CHALLENGE_LOCATION)

        # Hidden files are added after the agent runs; they can be Python test files.
        # We copy them into the workspace to make it easy to import the code produced by the agent.
        copy_artifacts_into_workspace(
            config["workspace"], "hidden_files", self.CHALLENGE_LOCATION
        )

    def test_method(self, config: Dict[str, Any]) -> None:
        raise NotImplementedError
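The hunk only shows the call site of copy_artifacts_into_workspace. A minimal sketch of what such a helper could look like, assuming CHALLENGE_LOCATION resolves to the challenge directory containing the hidden_files folder (the real agbenchmark implementation may differ):

```python
import shutil
from pathlib import Path


def copy_artifacts_into_workspace(
    workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
    # Illustrative sketch: copy every file from, e.g., <challenge>/hidden_files
    # into the agent's workspace so test.py can import the generated code.
    source_dir = Path(challenge_dir_path) / artifact_folder_name
    if not source_dir.exists():
        return
    for file_path in source_dir.iterdir():
        if file_path.is_file():
            shutil.copy(file_path, workspace)
```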
@@ -46,3 +46,19 @@ Example:
Current Output:

- **score** (float): scores range from [0, 1]

## Add files to challenges:

### artifacts_in

This folder contains all the files you want the agent to have in its workspace BEFORE the challenge starts.

### artifacts_out

This folder contains all the files you would like the agent to generate. This folder is used to mock the agent:
it allows you to run `agbenchmark start --test=TestExample --mock` and make sure the challenge actually works.

### hidden_files

This folder contains files that are hidden from the agent but are useful for assessing whether a challenge succeeded.
For example, we can put a test.py in it, and this test.py will be added to the workspace at the end of the challenge.
This lets us run test.py and easily import the code generated by the agent.
For an example, see the TestBasicCodeGeneration challenge.
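As a concrete illustration of this layout, the challenge added in this PR keeps artifacts_out, hidden_files, and data.json side by side (it has no artifacts_in). A small, purely illustrative sketch of creating that skeleton; the script itself is not part of agbenchmark:

```python
from pathlib import Path

# Skeleton of the challenge added in this PR. artifacts_in is optional and
# TestBasicCodeGeneration does not use it, so it is omitted here.
challenge = Path("agbenchmark/challenges/code/d4")
for folder in ("artifacts_out", "hidden_files"):
    (challenge / folder).mkdir(parents=True, exist_ok=True)

# data.json sits next to the folders and describes the task and ground checks.
(challenge / "data.json").touch()
```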
agbenchmark/challenges/code/d4/artifacts_out/code.py (new file, 12 lines)
@@ -0,0 +1,12 @@
# mypy: ignore-errors
from typing import List, Optional


def two_sum(nums: List, target: int) -> Optional[List[int]]:
    seen = {}
    for i, num in enumerate(nums):
        complement = target - num
        if complement in seen:
            return [seen[complement], i]
        seen[num] = i
    return None
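As a quick sanity check of the reference solution (not part of the diff), the three calls below cover the same cases the hidden test exercises; they assume code.py above is in the current directory so that `from code import two_sum` resolves to it:

```python
from code import two_sum  # resolves to the local code.py shown above

print(two_sum([2, 7, 11, 15], 9))        # [0, 1]
print(two_sum([2, 7, 0, 15, 12, 0], 0))  # [2, 5]
print(two_sum([-6, 7, 11, 4], -2))       # [0, 3]
```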
agbenchmark/challenges/code/d4/data.json (new file, 18 lines)
@@ -0,0 +1,18 @@
{
  "name": "TestBasicCodeGeneration",
  "category": ["code", "iterate"],
  "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
  "dependencies": ["TestWriteFile"],
  "ground": {
    "answer": "The two_sum function coded properly.",
    "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
    "should_not_contain": [],
    "files": ["test.py"],
    "type": "execute_python_code"
  },
  "info": {
    "difficulty": "novice",
    "description": "Tests ability for the agent to create the two_sum function.",
    "side_effects": []
  }
}
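The ground block declares a check of type execute_python_code against test.py, with three expected index pairs in should_contain. A minimal sketch of how such a check could be evaluated, assuming the scorer simply runs the listed files and matches their stdout (the function name and details here are illustrative, not agbenchmark's actual scorer):

```python
import subprocess
from typing import Any, Dict


def score_execute_python_code(ground: Dict[str, Any], workspace: str) -> float:
    # Hypothetical scorer: run each ground file (e.g. test.py) in the workspace
    # and compare the combined output with should_contain / should_not_contain.
    output = ""
    for file_name in ground["files"]:
        result = subprocess.run(
            ["python", file_name], cwd=workspace, capture_output=True, text=True
        )
        output += result.stdout
    if any(snippet in output for snippet in ground["should_not_contain"]):
        return 0.0
    hits = sum(snippet in output for snippet in ground["should_contain"])
    return hits / len(ground["should_contain"])
```

Because should_not_contain is empty here, such a score would reduce to the fraction of expected index pairs found in the test output.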
agbenchmark/challenges/code/d4/hidden_files/test.py (new file, 31 lines)
@@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List


def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
    print(result)
    assert (
        result == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # test the trivial case with the first two numbers
    nums = [2, 7, 11, 15]
    target = 9
    expected_result = [0, 1]
    test_two_sum(nums, target, expected_result)

    # test for ability to use zero and the same number twice
    nums = [2, 7, 0, 15, 12, 0]
    target = 0
    expected_result = [2, 5]
    test_two_sum(nums, target, expected_result)

    # test for first and last index usage and negative numbers
    nums = [-6, 7, 11, 4]
    target = -2
    expected_result = [0, 3]
    test_two_sum(nums, target, expected_result)
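Note how this hidden test ties back to data.json above: when the agent's code.py is correct, running test.py prints [0, 1], [2, 5], and [0, 3] in turn, which are exactly the should_contain strings the ground check looks for, while a wrong implementation either prints different indices or fails an assertion before producing them.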
Submodule agent/gpt-engineer updated: 521d626c00...bca191cd76
@@ -38,10 +38,10 @@ testpaths = [
]
markers = [
    "retrieval",
    "regression",
    "interface",
    "code",
-   "memory"
+   "memory",
+   "iterate"
]

[tool.poetry.scripts]