Mirror of https://github.com/aljazceru/Auto-GPT.git, synced 2025-12-18 14:34:23 +01:00
fix(benchmark): Mock mode, python evals, --attempts flag, challenge definitions
- Fixed `--mock` mode
- Moved the interrupt to the beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code still runs when only a single step is executed.
- Implemented mock mode in `WebArenaChallenge`
- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted
- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`
- Disabled left-over Helicone code (see 056163e)
- Fixed a couple of challenge definitions
- WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
- synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list
- Added some debug logging in `agent_api_interface.py` and `challenges/builtin.py`
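
For context on the `--mock` flag used throughout the diff below: it reaches the challenge code via `request.config.getoption("--mock")`. A minimal sketch of how such a pytest option is registered and exposed, illustrative only since agbenchmark's actual conftest is not part of this commit:

```python
# Illustrative conftest.py sketch (not agbenchmark's actual conftest).
import pytest


def pytest_addoption(parser: pytest.Parser) -> None:
    parser.addoption(
        "--mock",
        action="store_true",
        default=False,
        help="Run challenges in mock mode (single step, canned artifacts/answers)",
    )


@pytest.fixture
def is_mock(request: pytest.FixtureRequest) -> bool:
    # Hypothetical helper fixture; the diff below reads the option directly instead.
    return request.config.getoption("--mock")
```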
```diff
@@ -23,6 +23,8 @@ async def run_api_agent(
     config: AgentBenchmarkConfig,
     timeout: int,
     artifacts_location: Optional[Path] = None,
+    *,
+    mock: bool = False,
 ) -> AsyncIterator[Step]:
     configuration = Configuration(host=config.host)
     async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
         task_id = response.task_id
 
         if artifacts_location:
+            logger.debug("Uploading task input artifacts to agent...")
             await upload_artifacts(
                 api_instance, artifacts_location, task_id, "artifacts_in"
             )
 
+        logger.debug("Running agent until finished or timeout...")
         while True:
             step = await api_instance.execute_agent_task_step(task_id=task_id)
             yield step
 
             if time.time() - start_time > timeout:
                 raise TimeoutError("Time limit exceeded")
+            if step and mock:
+                step.is_last = True
             if not step or step.is_last:
                 break
 
         if artifacts_location:
             # In "mock" mode, we cheat by giving the correct artifacts to pass the test
-            if os.getenv("IS_MOCK"):
+            if mock:
+                logger.debug("Uploading mock artifacts to agent...")
                 await upload_artifacts(
                     api_instance, artifacts_location, task_id, "artifacts_out"
                 )
 
+        logger.debug("Downloading agent artifacts...")
         await download_agent_artifacts_into_folder(
             api_instance, task_id, config.temp_folder
         )
```
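
The reason the interrupt moved into `run_api_agent` rather than staying in the consuming test: breaking out of an `async for` in the caller abandons the generator at its `yield`, so the finish-up code after the loop (mock artifact upload, artifact download) never runs. A toy sketch of the difference, illustrative only and not project code:

```python
import asyncio


async def run_steps(mock: bool = False):
    """Stand-in for run_api_agent: yields steps, then does finish-up work."""
    step = 0
    while True:
        step += 1
        yield step
        if mock or step >= 3:  # interrupt *inside* the iterator...
            break
    print("finish-up code ran")  # ...so this line still executes


async def generator_breaks() -> None:
    # New behaviour: one step is yielded, then the generator finishes itself.
    async for _ in run_steps(mock=True):
        pass


async def caller_breaks() -> None:
    # Old behaviour: the caller breaks, the generator stays suspended at
    # `yield`, and the finish-up line above never runs.
    async for _ in run_steps():
        break


asyncio.run(generator_breaks())  # prints "finish-up code ran"
asyncio.run(caller_breaks())     # prints nothing
```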
```diff
@@ -60,7 +60,7 @@ class BaseChallenge(ABC):
 
     @classmethod
     async def run_challenge(
-        cls, config: AgentBenchmarkConfig, timeout: int
+        cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
     ) -> AsyncIterator[Step]:
         """
         Runs the challenge on the subject agent with the specified timeout.
@@ -89,7 +89,7 @@ class BaseChallenge(ABC):
         logger.debug(f"Starting {cls.info.name} challenge run")
         i = 0
         async for step in run_api_agent(
-            cls.info.task, config, timeout, cls.info.task_artifacts_dir
+            cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
         ):
             i += 1
             print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
```
```diff
@@ -158,12 +158,12 @@ class BuiltinChallenge(BaseChallenge):
         self,
         config: AgentBenchmarkConfig,
         request: pytest.FixtureRequest,
-        i_attempt: int,
+        i_attempt: int = 0,
     ) -> None:
-        if os.environ.get("HELICONE_API_KEY"):
-            from helicone.lock import HeliconeLockManager
+        # if os.environ.get("HELICONE_API_KEY"):
+        #     from helicone.lock import HeliconeLockManager
 
-            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+        #     HeliconeLockManager.write_custom_property("challenge", self.info.name)
 
         timeout = self._spec.cutoff or 60
 
```
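
On the `i_attempt: int = 0` change: pytest only resolves parameters without default values as fixtures, so the default lets the test collect and run once when nothing parametrizes `i_attempt`. A standalone illustration of that behaviour follows; how agbenchmark actually parametrizes `i_attempt` when `--attempts`/`-N` is given lives in its conftest, outside this diff:

```python
# With no `i_attempt` fixture or parametrization in scope, the first test
# errors with "fixture 'i_attempt' not found" at setup; the second collects
# normally and runs once with i_attempt == 0.


def test_old_signature(i_attempt: int):
    assert i_attempt >= 0


def test_new_signature(i_attempt: int = 0):
    assert i_attempt >= 0
```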
```diff
@@ -175,12 +175,11 @@ class BuiltinChallenge(BaseChallenge):
         task_id = ""
         timed_out = None
         try:
-            async for step in self.run_challenge(config, timeout):
+            async for step in self.run_challenge(
+                config, timeout, mock=request.config.getoption("--mock")
+            ):
                 if not task_id:
                     task_id = step.task_id
-                if request.config.getoption("--mock"):
-                    # Run only one step in mock mode
-                    break
             timed_out = False
         except TimeoutError:
             timed_out = True
@@ -230,15 +229,6 @@ class BuiltinChallenge(BaseChallenge):
 
     @classmethod
     def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
-        if cls._spec.task == "" and os.getenv("IS_MOCK"):
-            yield EvalResult(
-                result="This is a mock answer",
-                result_source="step_output",
-                score=1.0,
-                passed=True,
-            )
-            return
-
         result_ground = cls._spec.ground
         outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)
 
@@ -254,6 +244,15 @@ class BuiltinChallenge(BaseChallenge):
                         passed=score > 0.9,  # FIXME: arbitrary threshold
                     )
 
+        if result_ground.eval.type in ("python", "pytest"):
+            for py_file, output in outputs_for_eval:
+                yield EvalResult(
+                    result=output,
+                    result_source=str(py_file),
+                    score=float(not output.startswith("Error:")),
+                    passed=not output.startswith("Error:"),
+                )
+
         if result_ground.eval.type == "llm":
             combined_results = "\n".join(output[1] for output in outputs_for_eval)
             llm_eval = cls.score_result_with_llm(combined_results, result_ground)
```
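
To make the new `python`/`pytest` branch concrete: `get_outputs_for_eval` now yields strings prefixed with `Output:` or `Error:` (see the next hunks), and the branch above turns that prefix into the score. A standalone illustration of the mapping, with sample data only:

```python
# Sample (source, output) pairs in the shape get_outputs_for_eval yields them.
outputs_for_eval = [
    ("test_solution.py", "Output: 3 passed in 0.12s\n"),
    ("broken.py", "Error: Traceback (most recent call last): ...\n"),
]

for py_file, output in outputs_for_eval:
    errored = output.startswith("Error:")
    print(f"{py_file}: score={float(not errored)}, passed={not errored}")
# test_solution.py: score=1.0, passed=True
# broken.py: score=0.0, passed=False
```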
```diff
@@ -290,7 +289,16 @@ class BuiltinChallenge(BaseChallenge):
                 # Otherwise, it is a specific file
                 matching_files = [os.path.join(script_dir, file_pattern)]
 
+            logger.debug(
+                f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
+            )
+
             for file_path in matching_files:
+                relative_file_path = Path(file_path).relative_to(workspace)
+                logger.debug(
+                    f"Evaluating {relative_file_path} "
+                    f"(eval type: {ground.eval.type})..."
+                )
                 if ground.eval.type == "python":
                     result = subprocess.run(
                         [sys.executable, file_path],
@@ -299,15 +307,12 @@ class BuiltinChallenge(BaseChallenge):
                         text=True,
                     )
                     if "error" in result.stderr or result.returncode != 0:
-                        print(result.stderr)
-                        assert False, result.stderr
-                    yield (
-                        Path(file_path).relative_to(workspace),
-                        f"Output: {result.stdout}\n",
-                    )
+                        yield relative_file_path, f"Error: {result.stderr}\n"
+                    else:
+                        yield relative_file_path, f"Output: {result.stdout}\n"
                 else:
                     with open(file_path, "r") as f:
-                        yield Path(file_path).relative_to(workspace), f.read()
+                        yield relative_file_path, f.read()
         else:
             if ground.eval.type == "pytest":
                 result = subprocess.run(
@@ -317,8 +322,8 @@ class BuiltinChallenge(BaseChallenge):
                     text=True,
                 )
                 if "error" in result.stderr or result.returncode != 0:
-                    print(result.stderr)
-                    assert False, result.stderr
-                yield "pytest", f"Output: {result.stdout}\n"
+                    yield "pytest", f"Error: {result.stderr}\n"
+                else:
+                    yield "pytest", f"Output: {result.stdout}\n"
 
     @staticmethod
@@ -358,9 +363,9 @@ class BuiltinChallenge(BaseChallenge):
 
     @classmethod
     def score_result_with_llm(
-        cls, content: str, ground: BuiltinChallengeSpec.Ground
+        cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
     ) -> float:
-        if os.getenv("IS_MOCK"):
+        if mock:
            return 1.0
 
         # the validation for this is done in the Eval BaseModel
```
```diff
@@ -18,9 +18,7 @@
     "files": [
       "output.txt"
     ],
-    "should_contain": [
-      ""
-    ],
+    "should_contain": [],
     "should_not_contain": []
   },
   "info": {
```
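
Why the empty string had to go from `should_contain`: every string contains `""`, so a `""` entry makes the containment check succeed no matter what the agent wrote (the scoring code itself is not part of this diff). A two-line illustration:

```python
content = "anything the agent produced"
print("" in content)  # True - an empty string is a substring of every string,
                      # so a "" entry in should_contain can never fail.
```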
```diff
@@ -317,7 +317,11 @@ class WebArenaChallenge(BaseChallenge):
         return results
 
     @classmethod
-    def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
+    def evaluate_step_result(
+        cls, step: Step, *, mock: bool = False
+    ) -> list[tuple[_Eval, EvalResult]]:
+        if mock:
+            step.output = cls.info.reference_answer
         assert step.output
         eval_results = cls.evaluate_answer(step.output)
         for eval in cls._spec.eval.evaluators:
@@ -356,12 +360,12 @@ class WebArenaChallenge(BaseChallenge):
         self,
         config: AgentBenchmarkConfig,
         request: pytest.FixtureRequest,
-        i_attempt: int,
+        i_attempt: int = 0,
     ) -> None:
-        if os.environ.get("HELICONE_API_KEY"):
-            from helicone.lock import HeliconeLockManager
+        # if os.environ.get("HELICONE_API_KEY"):
+        #     from helicone.lock import HeliconeLockManager
 
-            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+        #     HeliconeLockManager.write_custom_property("challenge", self.info.name)
 
         timeout = 120
         if request.config.getoption("--nc"):
@@ -372,11 +376,15 @@ class WebArenaChallenge(BaseChallenge):
         timed_out = None
         eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
         try:
-            async for step in self.run_challenge(config, timeout):
+            async for step in self.run_challenge(
+                config, timeout, mock=request.config.getoption("--mock")
+            ):
                 if not step.output:
                     logger.warn(f"Step has no output: {step}")
                     continue
-                step_eval_results = self.evaluate_step_result(step)
+                step_eval_results = self.evaluate_step_result(
+                    step, mock=request.config.getoption("--mock")
+                )
                 logger.debug(f"Intermediary results: {step_eval_results}")
                 eval_results_per_step.append(step_eval_results)
                 if step.is_last:
```
```diff
@@ -334,7 +334,7 @@
           "June: 13 orders",
           "July: 9 orders",
           "August: 8 orders",
-          "Sepetember: 10 orders",
+          "September: 10 orders",
           "October: 4 orders",
           "November: 5 orders",
           "December: 10 orders"
@@ -343,7 +343,7 @@
       "reference_url": "",
       "program_html": [],
       "string_note": "",
-      "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
+      "reference_answer_raw_annotation": "May: 8 orders; June: 13 orders; July: 9 orders; August: 8 orders; September: 10 orders; October: 4 orders; November: 5 orders; December: 10 orders"
     },
     "intent_template_id": 270,
     "string_note": null,
```