Mirror of https://github.com/aljazceru/Auto-GPT.git (synced 2025-12-18 14:34:23 +01:00)
fix(benchmark): Mock mode, python evals, --attempts flag, challenge definitions
- Fixed `--mock` mode
- Moved the interrupt to the beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code still runs after executing only a single step (see the first sketch below).
- Implemented mock mode in `WebArenaChallenge`
- Fixed the `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted (see the second sketch below)
- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge` (see the third sketch below)
- Disabled left-over Helicone code (see 056163e)
- Fixed a couple of challenge definitions
- WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
- synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list
- Added some debug logging in agent_api_interface.py and challenges/builtin.py
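
Below is a minimal, self-contained sketch (toy names only; `Step` and `run_steps` are simplified stand-ins for the agent-protocol `Step` model and `run_api_agent`) of why the interrupt had to move into the iterator itself: when mock mode flags the yielded step as `is_last`, the generator leaves its loop normally and the finish-up code after the loop still runs, whereas `break`-ing out of the consumer's `async for` closes the generator at the `yield` and skips that code.

```python
# Illustrative sketch only -- not the benchmark's actual code.
import asyncio
from dataclasses import dataclass
from typing import AsyncIterator


@dataclass
class Step:
    name: str
    is_last: bool = False


async def run_steps(n_steps: int, *, mock: bool = False) -> AsyncIterator[Step]:
    for i in range(n_steps):
        step = Step(name=f"step_{i + 1}")
        yield step

        if mock:
            # Interrupt inside the iterator: mark the step as the last one...
            step.is_last = True
        if step.is_last:
            break

    # ...so this finish-up code still runs (artifact download, in the real benchmark).
    print("finish-up code executed")


async def main() -> None:
    async for step in run_steps(3, mock=True):
        print(f"ran {step.name}")


asyncio.run(main())
```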
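
For the `i_attempt` fix, the relevant pytest behaviour is that a test argument with a default value is not treated as a required fixture. A hedged, stand-alone illustration (not the benchmark's conftest; how the default interacts with the `--attempts` parametrization hook is not shown here):

```python
# Hedged illustration, not the benchmark's conftest: because `i_attempt` has a
# default value, pytest does not try to resolve it as a fixture, so running
# without any parametrization no longer fails with "fixture 'i_attempt' not found".
def test_challenge_once(i_attempt: int = 0) -> None:
    assert i_attempt == 0
```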
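
The reworked `python`/`pytest` eval path (visible in the `get_outputs_for_eval` and `evaluate_workspace_content` hunks below) reports failures as `Error:`-prefixed outputs that score 0, instead of `assert False` aborting the whole evaluation. A stand-alone sketch of that behaviour, using a hypothetical helper name:

```python
# Sketch with a hypothetical helper (run_python_eval is not part of agbenchmark);
# it mirrors the Error:/Output: convention used in the diff below.
import subprocess
import sys
import tempfile
from pathlib import Path


def run_python_eval(py_file: Path) -> tuple[Path, str]:
    result = subprocess.run(
        [sys.executable, str(py_file)], capture_output=True, text=True
    )
    if "error" in result.stderr or result.returncode != 0:
        return py_file, f"Error: {result.stderr}\n"
    return py_file, f"Output: {result.stdout}\n"


with tempfile.TemporaryDirectory() as tmp:
    ok = Path(tmp, "ok.py")
    ok.write_text("print('hello')\n")
    bad = Path(tmp, "bad.py")
    bad.write_text("raise ValueError('boom')\n")

    for py_file in (ok, bad):
        _, output = run_python_eval(py_file)
        # Same scoring rule as the new EvalResult: Error output -> score 0.0
        score = float(not output.startswith("Error:"))
        print(py_file.name, score)
```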
@@ -23,6 +23,8 @@ async def run_api_agent(
     config: AgentBenchmarkConfig,
     timeout: int,
     artifacts_location: Optional[Path] = None,
+    *,
+    mock: bool = False,
 ) -> AsyncIterator[Step]:
     configuration = Configuration(host=config.host)
     async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
         task_id = response.task_id
 
         if artifacts_location:
+            logger.debug("Uploading task input artifacts to agent...")
             await upload_artifacts(
                 api_instance, artifacts_location, task_id, "artifacts_in"
             )
 
+        logger.debug("Running agent until finished or timeout...")
         while True:
             step = await api_instance.execute_agent_task_step(task_id=task_id)
             yield step
 
             if time.time() - start_time > timeout:
                 raise TimeoutError("Time limit exceeded")
+            if step and mock:
+                step.is_last = True
             if not step or step.is_last:
                 break
 
         if artifacts_location:
             # In "mock" mode, we cheat by giving the correct artifacts to pass the test
-            if os.getenv("IS_MOCK"):
+            if mock:
+                logger.debug("Uploading mock artifacts to agent...")
                 await upload_artifacts(
                     api_instance, artifacts_location, task_id, "artifacts_out"
                 )
 
+            logger.debug("Downloading agent artifacts...")
             await download_agent_artifacts_into_folder(
                 api_instance, task_id, config.temp_folder
             )
@@ -60,7 +60,7 @@ class BaseChallenge(ABC):
 
     @classmethod
     async def run_challenge(
-        cls, config: AgentBenchmarkConfig, timeout: int
+        cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
     ) -> AsyncIterator[Step]:
         """
         Runs the challenge on the subject agent with the specified timeout.
@@ -89,7 +89,7 @@ class BaseChallenge(ABC):
         logger.debug(f"Starting {cls.info.name} challenge run")
         i = 0
         async for step in run_api_agent(
-            cls.info.task, config, timeout, cls.info.task_artifacts_dir
+            cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
         ):
             i += 1
             print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
@@ -158,12 +158,12 @@ class BuiltinChallenge(BaseChallenge):
         self,
         config: AgentBenchmarkConfig,
         request: pytest.FixtureRequest,
-        i_attempt: int,
+        i_attempt: int = 0,
     ) -> None:
-        if os.environ.get("HELICONE_API_KEY"):
-            from helicone.lock import HeliconeLockManager
+        # if os.environ.get("HELICONE_API_KEY"):
+        #     from helicone.lock import HeliconeLockManager
 
-            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+        #     HeliconeLockManager.write_custom_property("challenge", self.info.name)
 
         timeout = self._spec.cutoff or 60
 
@@ -175,12 +175,11 @@ class BuiltinChallenge(BaseChallenge):
         task_id = ""
         timed_out = None
         try:
-            async for step in self.run_challenge(config, timeout):
+            async for step in self.run_challenge(
+                config, timeout, mock=request.config.getoption("--mock")
+            ):
                 if not task_id:
                     task_id = step.task_id
-                if request.config.getoption("--mock"):
-                    # Run only one step in mock mode
-                    break
             timed_out = False
         except TimeoutError:
             timed_out = True
@@ -230,15 +229,6 @@ class BuiltinChallenge(BaseChallenge):
 
     @classmethod
     def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
-        if cls._spec.task == "" and os.getenv("IS_MOCK"):
-            yield EvalResult(
-                result="This is a mock answer",
-                result_source="step_output",
-                score=1.0,
-                passed=True,
-            )
-            return
-
         result_ground = cls._spec.ground
         outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)
 
@@ -254,6 +244,15 @@ class BuiltinChallenge(BaseChallenge):
                 passed=score > 0.9, # FIXME: arbitrary threshold
             )
 
+        if result_ground.eval.type in ("python", "pytest"):
+            for py_file, output in outputs_for_eval:
+                yield EvalResult(
+                    result=output,
+                    result_source=str(py_file),
+                    score=float(not output.startswith("Error:")),
+                    passed=not output.startswith("Error:"),
+                )
+
         if result_ground.eval.type == "llm":
             combined_results = "\n".join(output[1] for output in outputs_for_eval)
             llm_eval = cls.score_result_with_llm(combined_results, result_ground)
@@ -290,7 +289,16 @@ class BuiltinChallenge(BaseChallenge):
                 # Otherwise, it is a specific file
                 matching_files = [os.path.join(script_dir, file_pattern)]
 
+            logger.debug(
+                f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
+            )
+
             for file_path in matching_files:
+                relative_file_path = Path(file_path).relative_to(workspace)
+                logger.debug(
+                    f"Evaluating {relative_file_path} "
+                    f"(eval type: {ground.eval.type})..."
+                )
                 if ground.eval.type == "python":
                     result = subprocess.run(
                         [sys.executable, file_path],
@@ -299,15 +307,12 @@ class BuiltinChallenge(BaseChallenge):
                         text=True,
                     )
                     if "error" in result.stderr or result.returncode != 0:
-                        print(result.stderr)
-                        assert False, result.stderr
-                    yield (
-                        Path(file_path).relative_to(workspace),
-                        f"Output: {result.stdout}\n",
-                    )
+                        yield relative_file_path, f"Error: {result.stderr}\n"
+                    else:
+                        yield relative_file_path, f"Output: {result.stdout}\n"
                 else:
                     with open(file_path, "r") as f:
-                        yield Path(file_path).relative_to(workspace), f.read()
+                        yield relative_file_path, f.read()
         else:
             if ground.eval.type == "pytest":
                 result = subprocess.run(
@@ -317,8 +322,8 @@ class BuiltinChallenge(BaseChallenge):
                     text=True,
                 )
                 if "error" in result.stderr or result.returncode != 0:
-                    print(result.stderr)
-                    assert False, result.stderr
+                    yield "pytest", f"Error: {result.stderr}\n"
+                else:
                     yield "pytest", f"Output: {result.stdout}\n"
 
     @staticmethod
@@ -358,9 +363,9 @@ class BuiltinChallenge(BaseChallenge):
 
     @classmethod
     def score_result_with_llm(
-        cls, content: str, ground: BuiltinChallengeSpec.Ground
+        cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
     ) -> float:
-        if os.getenv("IS_MOCK"):
+        if mock:
             return 1.0
 
         # the validation for this is done in the Eval BaseModel
@@ -18,9 +18,7 @@
     "files": [
       "output.txt"
     ],
-    "should_contain": [
-      ""
-    ],
+    "should_contain": [],
     "should_not_contain": []
   },
   "info": {
@@ -317,7 +317,11 @@ class WebArenaChallenge(BaseChallenge):
         return results
 
     @classmethod
-    def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
+    def evaluate_step_result(
+        cls, step: Step, *, mock: bool = False
+    ) -> list[tuple[_Eval, EvalResult]]:
+        if mock:
+            step.output = cls.info.reference_answer
         assert step.output
         eval_results = cls.evaluate_answer(step.output)
         for eval in cls._spec.eval.evaluators:
@@ -356,12 +360,12 @@ class WebArenaChallenge(BaseChallenge):
         self,
         config: AgentBenchmarkConfig,
         request: pytest.FixtureRequest,
-        i_attempt: int,
+        i_attempt: int = 0,
     ) -> None:
-        if os.environ.get("HELICONE_API_KEY"):
-            from helicone.lock import HeliconeLockManager
+        # if os.environ.get("HELICONE_API_KEY"):
+        #     from helicone.lock import HeliconeLockManager
 
-            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+        #     HeliconeLockManager.write_custom_property("challenge", self.info.name)
 
         timeout = 120
         if request.config.getoption("--nc"):
@@ -372,11 +376,15 @@ class WebArenaChallenge(BaseChallenge):
         timed_out = None
         eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
         try:
-            async for step in self.run_challenge(config, timeout):
+            async for step in self.run_challenge(
+                config, timeout, mock=request.config.getoption("--mock")
+            ):
                 if not step.output:
                     logger.warn(f"Step has no output: {step}")
                     continue
-                step_eval_results = self.evaluate_step_result(step)
+                step_eval_results = self.evaluate_step_result(
+                    step, mock=request.config.getoption("--mock")
+                )
                 logger.debug(f"Intermediary results: {step_eval_results}")
                 eval_results_per_step.append(step_eval_results)
                 if step.is_last:
@@ -334,7 +334,7 @@
         "June: 13 orders",
         "July: 9 orders",
         "August: 8 orders",
-        "Sepetember: 10 orders",
+        "September: 10 orders",
         "October: 4 orders",
         "November: 5 orders",
         "December: 10 orders"
@@ -343,7 +343,7 @@
       "reference_url": "",
       "program_html": [],
       "string_note": "",
-      "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
+      "reference_answer_raw_annotation": "May: 8 orders; June: 13 orders; July: 9 orders; August: 8 orders; September: 10 orders; October: 4 orders; November: 5 orders; December: 10 orders"
     },
     "intent_template_id": 270,
     "string_note": null,