mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2026-01-08 16:54:31 +01:00
Add 'Debug simple typo with guidance' challenge (#65)
Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
This commit is contained in:
8
agbenchmark/challenges/code/code.py
Normal file
8
agbenchmark/challenges/code/code.py
Normal file
@@ -0,0 +1,8 @@
|
||||
import pytest
|
||||
|
||||
from agbenchmark.challenge import Challenge
|
||||
|
||||
|
||||
@pytest.mark.code
class CodeChallenge(Challenge):
    """Base class for challenges in the ``code`` category.

    The class is tagged with the ``code`` pytest marker; it adds no behavior
    of its own on top of ``Challenge``.
    """
|
||||
13
agbenchmark/challenges/code/d1/artifacts_in/code.py
Normal file
13
agbenchmark/challenges/code/d1/artifacts_in/code.py
Normal file
@@ -0,0 +1,13 @@
|
||||
# mypy: ignore-errors
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Find two entries of ``nums`` that sum to ``target``.

    Args:
        nums: The values to search.
        target: The sum the two chosen entries must add up to.

    Returns:
        ``[earlier_index, later_index]`` of the first matching pair found
        while scanning left to right, or ``None`` if no pair matches.
    """
    # Maps each value already visited to the index where it was last seen.
    seen = {}
    for i, num in enumerate(nums):
        typo
        complement = target - num
        if complement in seen:
            return [seen[complement], i]
        seen[num] = i
    return None
|
||||
31
agbenchmark/challenges/code/d1/artifacts_in/test.py
Normal file
31
agbenchmark/challenges/code/d1/artifacts_in/test.py
Normal file
@@ -0,0 +1,31 @@
|
||||
# mypy: ignore-errors
|
||||
from code import two_sum
|
||||
from typing import List
|
||||
|
||||
|
||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print its output and check it.

    The result is printed before the check so it appears on stdout even
    when the assertion fails.

    Args:
        nums: Input values passed to ``two_sum``.
        target: Target sum passed to ``two_sum``.
        expected_result: The value ``two_sum`` must return.

    Raises:
        AssertionError: If the returned value differs from ``expected_result``.
    """
    actual = two_sum(nums, target)
    print(actual)
    assert actual == expected_result, (
        f"AssertionError: Expected the output to be {expected_result}"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Each entry is (nums, target, expected_result); cases run in order and
    # the first failing assertion aborts the script.
    cases = [
        # test the trivial case with the first two numbers
        ([2, 7, 11, 15], 9, [0, 1]),
        # test for ability to use zero and the same number twice
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        # test for first and last index usage and negative numbers
        ([-6, 7, 11, 4], -2, [0, 3]),
    ]
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)
|
||||
12
agbenchmark/challenges/code/d1/artifacts_out/code.py
Normal file
12
agbenchmark/challenges/code/d1/artifacts_out/code.py
Normal file
@@ -0,0 +1,12 @@
|
||||
# mypy: ignore-errors
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Find two entries of ``nums`` that sum to ``target``.

    Scans ``nums`` once from left to right, remembering the most recent
    index of every value seen so far.

    Args:
        nums: The values to search.
        target: The sum the two chosen entries must add up to.

    Returns:
        ``[earlier_index, later_index]`` for the first pair completed during
        the scan, or ``None`` if no two entries add up to ``target``.
    """
    index_by_value = {}
    for position, value in enumerate(nums):
        # Stored indices are ints, so ``None`` can only mean "not seen yet".
        partner = index_by_value.get(target - value)
        if partner is not None:
            return [partner, position]
        index_by_value[value] = position
    return None
|
||||
31
agbenchmark/challenges/code/d1/artifacts_out/test.py
Normal file
31
agbenchmark/challenges/code/d1/artifacts_out/test.py
Normal file
@@ -0,0 +1,31 @@
|
||||
# mypy: ignore-errors
|
||||
from code import two_sum
|
||||
from typing import List
|
||||
|
||||
|
||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print its output and check it.

    The result is printed before the check so it appears on stdout even
    when the assertion fails.

    Args:
        nums: Input values passed to ``two_sum``.
        target: Target sum passed to ``two_sum``.
        expected_result: The value ``two_sum`` must return.

    Raises:
        AssertionError: If the returned value differs from ``expected_result``.
    """
    actual = two_sum(nums, target)
    print(actual)
    assert actual == expected_result, (
        f"AssertionError: Expected the output to be {expected_result}"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Each entry is (nums, target, expected_result); cases run in order and
    # the first failing assertion aborts the script.
    cases = [
        # test the trivial case with the first two numbers
        ([2, 7, 11, 15], 9, [0, 1]),
        # test for ability to use zero and the same number twice
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        # test for first and last index usage and negative numbers
        ([-6, 7, 11, 4], -2, [0, 3]),
    ]
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "debug_simple_typo_with_guidance",
|
||||
"category": ["code"],
|
||||
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
|
||||
"dependencies": [],
|
||||
"ground": {
|
||||
"answer": "2314",
|
||||
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
|
||||
"should_not_contain": [],
|
||||
"files": ["test.py"],
|
||||
"type": "execute_python_code"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": null,
|
||||
"mock_task": null
|
||||
},
|
||||
"info": {
|
||||
"difficulty": "basic",
|
||||
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
|
||||
"side_effects": ["tests if there is in fact an LLM attached"]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
import pytest
|
||||
|
||||
from agbenchmark.challenges.code.code import CodeChallenge
|
||||
|
||||
|
||||
class TestDebugSimpleTypoWithGuidance(CodeChallenge):
    """Code challenge: debug a simple typo with guidance."""

    def get_file_path(self) -> str:  # all tests must implement this method
        """Return the path of this challenge's JSON data file.

        The file is looked up in the same directory as this module.
        """
        return os.path.join(
            os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json"
        )

    @pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
    def test_method(self, config: Dict[str, Any]) -> None:
        """Run the challenge and require at least one perfect score.

        Args:
            config: Benchmark configuration; its ``"workspace"`` entry is
                passed to ``get_artifacts_out``.

        Raises:
            AssertionError: If no scored output received a score of ``1``.
        """
        self.setup_challenge(config)

        # One content entry per file listed in the challenge's ground data.
        files_contents = self.get_artifacts_out(
            config["workspace"], self.data.ground.files
        )

        scores = []
        for file_content in files_contents:
            score = self.scoring(file_content, self.data.ground)
            print("Your score is:", score)
            scores.append(score)

        # A single fully correct output is enough to pass.
        assert 1 in scores
|
||||
Reference in New Issue
Block a user