diff --git a/autogpts/autogpt/agbenchmark_config/config.json b/autogpts/autogpt/agbenchmark_config/config.json index d3762ac0..995574c2 100644 --- a/autogpts/autogpt/agbenchmark_config/config.json +++ b/autogpts/autogpt/agbenchmark_config/config.json @@ -1 +1 @@ -{"workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks"} \ No newline at end of file +{"workspace": {"input": "auto_gpt_workspace", "output":"auto_gpt_workspace" }, "entry_path": "agbenchmark.benchmarks"} diff --git a/autogpts/autogpt/challenges_already_beaten.json b/autogpts/autogpt/challenges_already_beaten.json new file mode 100644 index 00000000..7bdab6f2 --- /dev/null +++ b/autogpts/autogpt/challenges_already_beaten.json @@ -0,0 +1,3 @@ +{ + "TestWriteFile": true +} \ No newline at end of file diff --git a/benchmark/agbenchmark/agent_interface.py b/benchmark/agbenchmark/agent_interface.py index 7d5f3258..c59f582b 100644 --- a/benchmark/agbenchmark/agent_interface.py +++ b/benchmark/agbenchmark/agent_interface.py @@ -75,19 +75,15 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None: def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None: - """Calling to get a response""" + print(f"Running agbenchmark/benchmarks.py with timeout {timeout}") - entry_path = agent_config.get_agent_entry_path() - print(f"Running '{entry_path}' with timeout {timeout}") - - command = [sys.executable, entry_path, str(task)] + command = [sys.executable, "-m", "agbenchmark_config.benchmarks", str(task)] process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, - cwd=agent_config.get_agent_directory(), bufsize=1, ) diff --git a/benchmark/agbenchmark/conftest.py b/benchmark/agbenchmark/conftest.py index 8c10256a..21a58466 100644 --- a/benchmark/agbenchmark/conftest.py +++ b/benchmark/agbenchmark/conftest.py @@ -53,7 +53,7 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig: raise -def resolve_workspace(workspace: Path) -> Path: +def resolve_workspace_path(workspace: Path) -> Path: """ This function resolves the workspace path. @@ -83,10 +83,10 @@ def resolve_workspace(workspace: Path) -> Path: return path_value else: raise ValueError("Invalid workspace path expression.") - elif isinstance(workspace, Path): - return os.path.abspath(workspace) + elif isinstance(workspace, str): + return os.path.abspath(Path.cwd() / workspace) else: - raise ValueError("Invalid workspace type. Expected str or Path.") + raise ValueError("Invalid workspace type. Expected str") @pytest.fixture(scope="module") @@ -119,15 +119,12 @@ def config(request: Any) -> Any: config["AgentBenchmarkConfig"] = agent_benchmark_config - if isinstance(config["workspace"], str): - config["workspace"] = resolve_workspace(agent_benchmark_config.workspace) - else: # it's a input output dict - config["workspace"]["input"] = resolve_workspace( - agent_benchmark_config.workspace / "input" - ) - config["workspace"]["output"] = resolve_workspace( - agent_benchmark_config.workspace / "output" - ) + config["workspace"]["input"] = resolve_workspace_path( + agent_benchmark_config.workspace.input + ) + config["workspace"]["output"] = resolve_workspace_path( + agent_benchmark_config.workspace.output + ) return config diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/agbenchmark/utils/data_types.py index c17cb5a5..5421c0a1 100644 --- a/benchmark/agbenchmark/utils/data_types.py +++ b/benchmark/agbenchmark/utils/data_types.py @@ -17,7 +17,9 @@ class DifficultyLevel(Enum): expert = "expert" human = "human" - +class Workspace(BaseModel): + input: str + output: str # map from enum to difficulty level (numeric) DIFFICULTY_MAP = { DifficultyLevel.interface: 1, @@ -85,7 +87,7 @@ class AgentBenchmarkConfig(BaseModel): agent_benchmark_config_path: Path | None = None entry_path: str - workspace: Path + workspace: Workspace reports_folder: Path | None = None api_mode: bool = False host: str | None