mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2025-12-17 22:14:28 +01:00
fix(benchmark): Mock mode, python evals, --attempts flag, challenge definitions
- Fixed `--mock` mode
  - Moved the mock-mode interrupt to the beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code still runs properly after the single executed step.
  - Implemented mock mode in `WebArenaChallenge`
- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted
- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`
- Disabled leftover Helicone code (see 056163e)
- Fixed a couple of challenge definitions
  - WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
  - synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list
- Added some debug logging in agent_api_interface.py and challenges/builtin.py
This commit is contained in:
@@ -23,6 +23,8 @@ async def run_api_agent(
|
||||
config: AgentBenchmarkConfig,
|
||||
timeout: int,
|
||||
artifacts_location: Optional[Path] = None,
|
||||
*,
|
||||
mock: bool = False,
|
||||
) -> AsyncIterator[Step]:
|
||||
configuration = Configuration(host=config.host)
|
||||
async with ApiClient(configuration) as api_client:
|
||||
@@ -36,26 +38,32 @@ async def run_api_agent(
|
||||
task_id = response.task_id
|
||||
|
||||
if artifacts_location:
|
||||
logger.debug("Uploading task input artifacts to agent...")
|
||||
await upload_artifacts(
|
||||
api_instance, artifacts_location, task_id, "artifacts_in"
|
||||
)
|
||||
|
||||
logger.debug("Running agent until finished or timeout...")
|
||||
while True:
|
||||
step = await api_instance.execute_agent_task_step(task_id=task_id)
|
||||
yield step
|
||||
|
||||
if time.time() - start_time > timeout:
|
||||
raise TimeoutError("Time limit exceeded")
|
||||
if step and mock:
|
||||
step.is_last = True
|
||||
if not step or step.is_last:
|
||||
break
|
||||
|
||||
if artifacts_location:
|
||||
# In "mock" mode, we cheat by giving the correct artifacts to pass the test
|
||||
if os.getenv("IS_MOCK"):
|
||||
if mock:
|
||||
logger.debug("Uploading mock artifacts to agent...")
|
||||
await upload_artifacts(
|
||||
api_instance, artifacts_location, task_id, "artifacts_out"
|
||||
)
|
||||
|
||||
logger.debug("Downloading agent artifacts...")
|
||||
await download_agent_artifacts_into_folder(
|
||||
api_instance, task_id, config.temp_folder
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user