fix(benchmark): Mock mode, python evals, --attempts flag, challenge definitions

- Fixed `--mock` mode
   - Moved the mock-mode interrupt to the beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code (e.g. artifact upload/download) still runs after the single mocked step.
   - Implemented mock mode in `WebArenaChallenge`

- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted (see the toy example below this list)

- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`

- Disabled left-over Helicone code (see 056163e)

- Fixed a couple of challenge definitions
   - WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
   - synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list

- Added some debug logging in agent_api_interface.py and challenges/builtin.py
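
For context on the `i_attempt` fix: pytest only treats parameters *without* default values as fixture requests, so when no `--attempts`/`-N` parametrization supplies `i_attempt`, the un-defaulted parameter has nothing to satisfy it; giving it a default of 0 covers that case. A toy illustration of just that mechanism (not the benchmark's actual test code):

```python
# toy_test.py -- illustration only, not agbenchmark code

def test_attempt_without_default(i_attempt: int) -> None:
    # With no parametrization and no fixture named `i_attempt`, pytest errors
    # at setup with: "fixture 'i_attempt' not found"
    assert i_attempt >= 0

def test_attempt_with_default(i_attempt: int = 0) -> None:
    # A parameter with a default is not a fixture request, so this runs and
    # simply uses attempt #0 whenever the attempts parametrization is off.
    assert i_attempt == 0
```
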
Reinier van der Leer
2024-02-14 01:05:34 +01:00
parent bb7f5abc6c
commit 327fb1f916
6 changed files with 63 additions and 44 deletions

View File: agent_api_interface.py

@@ -23,6 +23,8 @@ async def run_api_agent(
config: AgentBenchmarkConfig,
timeout: int,
artifacts_location: Optional[Path] = None,
*,
mock: bool = False,
) -> AsyncIterator[Step]:
configuration = Configuration(host=config.host)
async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
task_id = response.task_id
if artifacts_location:
logger.debug("Uploading task input artifacts to agent...")
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_in"
)
logger.debug("Running agent until finished or timeout...")
while True:
step = await api_instance.execute_agent_task_step(task_id=task_id)
yield step
if time.time() - start_time > timeout:
raise TimeoutError("Time limit exceeded")
if step and mock:
step.is_last = True
if not step or step.is_last:
break
if artifacts_location:
# In "mock" mode, we cheat by giving the correct artifacts to pass the test
if os.getenv("IS_MOCK"):
if mock:
logger.debug("Uploading mock artifacts to agent...")
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_out"
)
logger.debug("Downloading agent artifacts...")
await download_agent_artifacts_into_folder(
api_instance, task_id, config.temp_folder
)
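
Not part of the diff: a minimal sketch of how a caller might drive the updated `run_api_agent`, based on the signature above. The task string, artifacts path, and import path are assumptions, not taken from the repository:

```python
from pathlib import Path

from agbenchmark.agent_api_interface import run_api_agent  # assumed module path

async def run_one_mock_step(config, task_dir: Path) -> None:
    # `config` is an AgentBenchmarkConfig; all values here are placeholders.
    async for step in run_api_agent(
        "Write 'Hello World' to output.txt",  # the task (first positional arg)
        config,
        timeout=60,
        artifacts_location=task_dir,
        mock=True,  # keyword-only; replaces the old IS_MOCK env-var check
    ):
        print(step.name, step.is_last)
    # With mock=True the first step is forced to be the last, so the loop ends
    # after one iteration and the artifact upload/download code still runs.
```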

View File

@@ -60,7 +60,7 @@ class BaseChallenge(ABC):
@classmethod
async def run_challenge(
cls, config: AgentBenchmarkConfig, timeout: int
cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
) -> AsyncIterator[Step]:
"""
Runs the challenge on the subject agent with the specified timeout.
@@ -89,7 +89,7 @@ class BaseChallenge(ABC):
logger.debug(f"Starting {cls.info.name} challenge run")
i = 0
async for step in run_api_agent(
cls.info.task, config, timeout, cls.info.task_artifacts_dir
cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
):
i += 1
print(f"[{cls.info.name}] - step {step.name} ({i}. request)")

View File: challenges/builtin.py

@@ -158,12 +158,12 @@ class BuiltinChallenge(BaseChallenge):
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
i_attempt: int = 0,
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
# if os.environ.get("HELICONE_API_KEY"):
# from helicone.lock import HeliconeLockManager
HeliconeLockManager.write_custom_property("challenge", self.info.name)
# HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = self._spec.cutoff or 60
@@ -175,12 +175,11 @@ class BuiltinChallenge(BaseChallenge):
task_id = ""
timed_out = None
try:
async for step in self.run_challenge(config, timeout):
async for step in self.run_challenge(
config, timeout, mock=request.config.getoption("--mock")
):
if not task_id:
task_id = step.task_id
if request.config.getoption("--mock"):
# Run only one step in mock mode
break
timed_out = False
except TimeoutError:
timed_out = True
@@ -230,15 +229,6 @@ class BuiltinChallenge(BaseChallenge):
@classmethod
def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
if cls._spec.task == "" and os.getenv("IS_MOCK"):
yield EvalResult(
result="This is a mock answer",
result_source="step_output",
score=1.0,
passed=True,
)
return
result_ground = cls._spec.ground
outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)
@@ -254,6 +244,15 @@ class BuiltinChallenge(BaseChallenge):
passed=score > 0.9, # FIXME: arbitrary threshold
)
if result_ground.eval.type in ("python", "pytest"):
for py_file, output in outputs_for_eval:
yield EvalResult(
result=output,
result_source=str(py_file),
score=float(not output.startswith("Error:")),
passed=not output.startswith("Error:"),
)
if result_ground.eval.type == "llm":
combined_results = "\n".join(output[1] for output in outputs_for_eval)
llm_eval = cls.score_result_with_llm(combined_results, result_ground)
@@ -290,7 +289,16 @@ class BuiltinChallenge(BaseChallenge):
# Otherwise, it is a specific file
matching_files = [os.path.join(script_dir, file_pattern)]
logger.debug(
f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
)
for file_path in matching_files:
relative_file_path = Path(file_path).relative_to(workspace)
logger.debug(
f"Evaluating {relative_file_path} "
f"(eval type: {ground.eval.type})..."
)
if ground.eval.type == "python":
result = subprocess.run(
[sys.executable, file_path],
@@ -299,15 +307,12 @@ class BuiltinChallenge(BaseChallenge):
text=True,
)
if "error" in result.stderr or result.returncode != 0:
print(result.stderr)
assert False, result.stderr
yield (
Path(file_path).relative_to(workspace),
f"Output: {result.stdout}\n",
)
yield relative_file_path, f"Error: {result.stderr}\n"
else:
yield relative_file_path, f"Output: {result.stdout}\n"
else:
with open(file_path, "r") as f:
yield Path(file_path).relative_to(workspace), f.read()
yield relative_file_path, f.read()
else:
if ground.eval.type == "pytest":
result = subprocess.run(
@@ -317,8 +322,8 @@ class BuiltinChallenge(BaseChallenge):
text=True,
)
if "error" in result.stderr or result.returncode != 0:
print(result.stderr)
assert False, result.stderr
yield "pytest", f"Error: {result.stderr}\n"
else:
yield "pytest", f"Output: {result.stdout}\n"
@staticmethod
@@ -358,9 +363,9 @@ class BuiltinChallenge(BaseChallenge):
@classmethod
def score_result_with_llm(
cls, content: str, ground: BuiltinChallengeSpec.Ground
cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
) -> float:
if os.getenv("IS_MOCK"):
if mock:
return 1.0
# the validation for this is done in the Eval BaseModel
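
Summarizing the `python`/`pytest` eval change in the hunks above: a failing script no longer aborts the run via `assert False`; it is yielded as an `Error: ...` output and turned into a zero-score `EvalResult`. A condensed sketch of that flow with simplified names (not the actual `BuiltinChallenge` methods):

```python
# Condensed sketch of the post-commit behaviour; simplified, not real agbenchmark code.
import subprocess
import sys
from pathlib import Path

def run_python_eval(file_path: Path) -> str:
    result = subprocess.run(
        [sys.executable, str(file_path)], capture_output=True, text=True
    )
    if "error" in result.stderr or result.returncode != 0:
        # previously: `assert False, result.stderr`, which killed the whole eval run
        return f"Error: {result.stderr}\n"
    return f"Output: {result.stdout}\n"

def score_output(output: str) -> tuple[float, bool]:
    # mirrors the new python/pytest branch in evaluate_workspace_content
    failed = output.startswith("Error:")
    return (0.0 if failed else 1.0), (not failed)
```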

View File: synthesize/1_basic_content_gen challenge definition

@@ -18,9 +18,7 @@
"files": [
"output.txt"
],
"should_contain": [
""
],
"should_contain": [],
"should_not_contain": []
},
"info": {

View File

@@ -317,7 +317,11 @@ class WebArenaChallenge(BaseChallenge):
return results
@classmethod
def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
def evaluate_step_result(
cls, step: Step, *, mock: bool = False
) -> list[tuple[_Eval, EvalResult]]:
if mock:
step.output = cls.info.reference_answer
assert step.output
eval_results = cls.evaluate_answer(step.output)
for eval in cls._spec.eval.evaluators:
@@ -356,12 +360,12 @@ class WebArenaChallenge(BaseChallenge):
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
i_attempt: int,
i_attempt: int = 0,
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
# if os.environ.get("HELICONE_API_KEY"):
# from helicone.lock import HeliconeLockManager
HeliconeLockManager.write_custom_property("challenge", self.info.name)
# HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = 120
if request.config.getoption("--nc"):
@@ -372,11 +376,15 @@ class WebArenaChallenge(BaseChallenge):
timed_out = None
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
try:
async for step in self.run_challenge(config, timeout):
async for step in self.run_challenge(
config, timeout, mock=request.config.getoption("--mock")
):
if not step.output:
logger.warn(f"Step has no output: {step}")
continue
step_eval_results = self.evaluate_step_result(step)
step_eval_results = self.evaluate_step_result(
step, mock=request.config.getoption("--mock")
)
logger.debug(f"Intermediary results: {step_eval_results}")
eval_results_per_step.append(step_eval_results)
if step.is_last:
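
A standalone sketch of the mock behaviour added to `WebArenaChallenge.evaluate_step_result` above: in `--mock` mode the step's output is overwritten with the challenge's reference answer before the evaluators see it. Simplified stand-in types, not the real agent-protocol models:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeStep:  # stand-in for the agent protocol's Step model
    output: Optional[str] = None
    is_last: bool = False

def evaluate_step_result_sketch(
    step: FakeStep, reference_answer: str, *, mock: bool = False
) -> str:
    if mock:
        # cheat: pretend the agent produced the reference answer
        step.output = reference_answer
    assert step.output
    return step.output  # the real method feeds this into evaluate_answer()

# e.g. with an excerpt of the task-107 reference data:
print(evaluate_step_result_sketch(FakeStep(), "October: 4 orders", mock=True))
```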

View File: WebArena task 107 challenge data

@@ -334,7 +334,7 @@
"June: 13 orders",
"July: 9 orders",
"August: 8 orders",
"Sepetember: 10 orders",
"September: 10 orders",
"October: 4 orders",
"November: 5 orders",
"December: 10 orders"
@@ -343,7 +343,7 @@
"reference_url": "",
"program_html": [],
"string_note": "",
"reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
"reference_answer_raw_annotation": "May: 8 orders; June: 13 orders; July: 9 orders; August: 8 orders; September: 10 orders; October: 4 orders; November: 5 orders; December: 10 orders"
},
"intent_template_id": 270,
"string_note": null,