Integrate Auto-GPT with Auto-GPT-Benchmarks (#4987)

* WIP

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>

* WIP

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>

* Update config for benchmark changes (#4883)

* Add Helicone

* Add reports, consolidate, update benchmark files (#4941)

* Update config

* add reports, consolidate, update benchmark files

* Update benchmarks.py

* Change entry_path and add __init__.py

* Remove Helicone integration because we now have a proxy at the system level

* Support more regression tests

* Fix Auto-GPT/benchmark integration

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>

* Remove cutoff

* Install agbenchmark and make continuous mode dynamic

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>

---------

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
Co-authored-by: Silen Naihin <silen.naihin@gmail.com>
Author: merwanehamadi
Date: 2023-07-22 14:58:46 -07:00
Committed by: GitHub
Commit: 4ada7d1784 (parent: e0bcde178e)
6 changed files with 50 additions and 9 deletions

agbenchmark/__init__.py (new, empty file)

agbenchmark/benchmarks.py
@@ -1,4 +1,6 @@
+import sys
 from pathlib import Path
+from typing import Tuple
 
 from autogpt.agents import Agent
 from autogpt.app.main import run_interaction_loop
@@ -12,14 +14,15 @@ from autogpt.workspace import Workspace
 
 PROJECT_DIR = Path().resolve()
 
-def run_task(task) -> None:
-    agent = bootstrap_agent(task)
+def run_specific_agent(task, continuous_mode=False) -> Tuple[str, int]:
+    agent = bootstrap_agent(task, continuous_mode)
     run_interaction_loop(agent)
 
-def bootstrap_agent(task):
+
+def bootstrap_agent(task, continuous_mode) -> Agent:
     config = ConfigBuilder.build_config_from_env(workdir=PROJECT_DIR)
-    config.continuous_mode = False
     config.debug_mode = True
+    config.continuous_mode = continuous_mode
     config.temperature = 0
     config.plain_output = True
     command_registry = get_command_registry(config)
@@ -29,7 +32,7 @@ def bootstrap_agent(task):
     ai_config = AIConfig(
         ai_name="Auto-GPT",
         ai_role="a multi-purpose AI assistant.",
-        ai_goals=[task.user_input],
+        ai_goals=[task],
     )
     ai_config.command_registry = command_registry
     return Agent(
@@ -50,3 +53,12 @@ def get_command_registry(config: Config):
     for command_category in enabled_command_categories:
         command_registry.import_commands(command_category)
     return command_registry
+
+
+if __name__ == "__main__":
+    # The first argument is the script name itself, second is the task
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <task>")
+        sys.exit(1)
+    task = sys.argv[1]
+    run_specific_agent(task, continuous_mode=True)
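For context, the new __main__ block makes the module runnable as a standalone script. The same entry point can also be driven in-process; a minimal sketch, with an illustrative task string that is not from the commit:

    from agbenchmark.benchmarks import run_specific_agent

    # Illustrative task; any benchmark task string works here.
    run_specific_agent("Write 'hello world' to output.txt", continuous_mode=True)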

agbenchmark/config.json (new file)

@@ -0,0 +1,4 @@
+{
+  "workspace": "autogpt/workspace/auto_gpt_workspace",
+  "entry_path": "agbenchmark.benchmarks"
+}
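Here entry_path names the module exposing the agent entry point, so the benchmark harness can import it dynamically. A minimal sketch of such a loader, assuming a JSON config like the one above (an illustration, not agbenchmark's actual loading code):

    import importlib
    import json

    # Read the benchmark config and import the module it points at.
    with open("agbenchmark/config.json") as f:
        config = json.load(f)

    benchmarks = importlib.import_module(config["entry_path"])  # agbenchmark.benchmarks
    benchmarks.run_specific_agent("example task", continuous_mode=True)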


@@ -0,0 +1,24 @@
+{
+  "TestBasicCodeGeneration": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "data_path": "agbenchmark/challenges/code/d3"
+  },
+  "TestBasicMemory": {
+    "difficulty": "basic",
+    "data_path": "agbenchmark/challenges/memory/m1"
+  },
+  "TestReadFile": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "data_path": "agbenchmark/challenges/interface/read_file"
+  },
+  "TestWriteFile": {
+    "dependencies": [],
+    "data_path": "agbenchmark/challenges/interface/write_file"
+  }
+}
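In this registry, a test's dependencies must pass before the test itself runs; for example, TestReadFile depends on TestWriteFile. A minimal ordering sketch under that assumption (illustrative only, not agbenchmark's real scheduler):

    # Depth-first traversal: each test is emitted after its dependencies.
    registry = {
        "TestBasicCodeGeneration": {"dependencies": ["TestWriteFile"]},
        "TestBasicMemory": {"dependencies": []},
        "TestReadFile": {"dependencies": ["TestWriteFile"]},
        "TestWriteFile": {"dependencies": []},
    }

    def run_order(reg: dict) -> list[str]:
        ordered: list[str] = []
        seen: set[str] = set()

        def visit(name: str) -> None:
            if name in seen:
                return
            seen.add(name)
            for dep in reg[name].get("dependencies", []):
                visit(dep)
            ordered.append(name)

        for name in reg:
            visit(name)
        return ordered

    print(run_order(registry))
    # -> ['TestWriteFile', 'TestBasicCodeGeneration', 'TestBasicMemory', 'TestReadFile']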

requirements.txt

@@ -1,9 +1,9 @@
 beautifulsoup4>=4.12.2
 colorama==0.4.6
 distro==1.8.0
-openai==0.27.2
+openai==0.27.8
 playsound==1.2.2
-python-dotenv==1.0.0
+python-dotenv==0.21
 pyyaml==6.0
 PyPDF2
 python-docx
@@ -31,6 +31,7 @@ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_
 prompt_toolkit>=3.0.38
 pydantic
 inflection
+agbenchmark
 # web server
 fastapi


@@ -6,9 +6,9 @@ from typing import Any, Generator
 import pytest
+from agbenchmark.benchmarks import run_specific_agent
 from autogpt.logs import LogCycleHandler
 from autogpt.workspace import Workspace
-from benchmarks import run_task
 from tests.challenges.schema import Task
@@ -75,4 +75,4 @@ def run_challenge(
     setup_mock_log_cycle_agent_name(monkeypatch, challenge_name, level_to_run)
     task = Task(user_input=user_input)
     with contextlib.suppress(SystemExit):
-        run_task(task)
+        run_specific_agent(task.user_input)
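The challenge runner now passes the raw task.user_input string to run_specific_agent and suppresses the SystemExit that the interaction loop raises when the agent stops. A self-contained sketch of that pattern (run_challenge_task is an illustrative name, not from the commit):

    import contextlib

    from agbenchmark.benchmarks import run_specific_agent

    def run_challenge_task(user_input: str) -> None:
        # The interaction loop exits via SystemExit when the agent finishes;
        # suppressing it lets the test continue and assert on the results.
        with contextlib.suppress(SystemExit):
            run_specific_agent(user_input)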