Make the information retrieval challenge harder while still passing (#4468)

merwanehamadi
2023-05-30 15:56:58 -07:00
committed by GitHub
parent 86b6231f70
commit 87776b2886
6 changed files with 82 additions and 54 deletions


@@ -22,6 +22,21 @@ from autogpt.utils import (
 from autogpt.workspace import Workspace
 from scripts.install_plugin_deps import install_plugin_dependencies

+COMMAND_CATEGORIES = [
+    "autogpt.commands.analyze_code",
+    "autogpt.commands.audio_text",
+    "autogpt.commands.execute_code",
+    "autogpt.commands.file_operations",
+    "autogpt.commands.git_operations",
+    "autogpt.commands.google_search",
+    "autogpt.commands.image_gen",
+    "autogpt.commands.improve_code",
+    "autogpt.commands.web_selenium",
+    "autogpt.commands.write_tests",
+    "autogpt.app",
+    "autogpt.commands.task_statuses",
+]
+
 def run_auto_gpt(
     continuous: bool,
@@ -128,30 +143,18 @@ def run_auto_gpt(
     # Create a CommandRegistry instance and scan default folder
     command_registry = CommandRegistry()
-    command_categories = [
-        "autogpt.commands.analyze_code",
-        "autogpt.commands.audio_text",
-        "autogpt.commands.execute_code",
-        "autogpt.commands.file_operations",
-        "autogpt.commands.git_operations",
-        "autogpt.commands.google_search",
-        "autogpt.commands.image_gen",
-        "autogpt.commands.improve_code",
-        "autogpt.commands.web_selenium",
-        "autogpt.commands.write_tests",
-        "autogpt.app",
-        "autogpt.commands.task_statuses",
-    ]
     logger.debug(
         f"The following command categories are disabled: {cfg.disabled_command_categories}"
     )
-    command_categories = [
-        x for x in command_categories if x not in cfg.disabled_command_categories
+    enabled_command_categories = [
+        x for x in COMMAND_CATEGORIES if x not in cfg.disabled_command_categories
     ]
-    logger.debug(f"The following command categories are enabled: {command_categories}")
+    logger.debug(
+        f"The following command categories are enabled: {enabled_command_categories}"
+    )

-    for command_category in command_categories:
+    for command_category in enabled_command_categories:
         command_registry.import_commands(command_category)

     ai_name = ""


@@ -1,16 +1,19 @@
 # Information Retrieval Challenge A

-**Status**: Current level to beat: level 1
+**Status**: Current level to beat: level 2

 **Command to try**:
 ```
-pytest -s tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_a.py
+pytest -s tests/integration/challenges/information_retrieval/test_information_retrieval_challenge_a.py --level=2
 ```

 ## Description
-The agent's goal is to find the revenue of Tesla in 2022.
+The agent's goal is to find the revenue of Tesla:
+- level 1 asks for the revenue of Tesla in 2022 and explicitly asks to search for 'tesla revenue 2022'
+- level 2 is identical but doesn't ask to search for 'tesla revenue 2022'
+- level 3 asks for Tesla's revenue for every year since its creation
 It should write the result in a file called output.txt.
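
For reference, a minimal sketch (not part of the commit) of how the three levels line up with the agent goals and expected figures introduced by this change. The goal strings and revenue lists are copied from the fixture and test diffs below; the helper function is purely illustrative:

```python
# Level-indexed data copied from this commit's fixture and test changes.
LEVEL_GOALS = [
    "Write to a file called output.txt tesla's revenue in 2022 after searching for 'tesla revenue 2022'.",
    "Write to a file called output.txt tesla's revenue in 2022.",
    "Write to a file called output.txt tesla's revenue every year since its creation.",
]
EXPECTED_REVENUES = [["81"], ["81"], ["81", "53", "24", "21", "11", "7", "4", "3", "2"]]

def goal_and_expected(level_to_run: int) -> tuple[str, list[str]]:
    # Levels are 1-indexed on the command line (--level=2); the lists are 0-indexed.
    return LEVEL_GOALS[level_to_run - 1], EXPECTED_REVENUES[level_to_run - 1]
```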


@@ -3,6 +3,7 @@ import pytest
 from autogpt.agent import Agent
 from autogpt.commands.command import CommandRegistry
 from autogpt.config import AIConfig, Config
+from autogpt.main import COMMAND_CATEGORIES
 from autogpt.memory.vector import NoMemory, get_memory
 from autogpt.prompts.prompt import DEFAULT_TRIGGERING_PROMPT
 from autogpt.workspace import Workspace
@@ -140,36 +141,46 @@ def memory_management_agent(agent_test_config, memory_json_file, workspace: Work
 @pytest.fixture
-def get_company_revenue_agent(
+def information_retrieval_agents(
     agent_test_config, memory_json_file, workspace: Workspace
 ):
+    agents = []
     command_registry = CommandRegistry()
-    command_registry.import_commands("autogpt.commands.file_operations")
-    command_registry.import_commands("autogpt.commands.google_search")
-    command_registry.import_commands("autogpt.app")
-    command_registry.import_commands("autogpt.commands.task_statuses")
-
-    ai_config = AIConfig(
-        ai_name="Information Retrieval Agent",
-        ai_role="an autonomous agent that specializes in retrieving information.",
-        ai_goals=[
-            "Search for 'tesla revenue 2022' and write the revenue of Tesla in 2022 to a file called output.txt. You should write the number without commas and you should not use signs like B for billion and M for million.",
-        ],
-    )
-    ai_config.command_registry = command_registry
-    system_prompt = ai_config.construct_full_prompt()
-    Config().set_continuous_mode(False)
-    agent = Agent(
-        ai_name="Get-CompanyRevenue",
-        memory=memory_json_file,
-        command_registry=command_registry,
-        config=ai_config,
-        next_action_count=0,
-        system_prompt=system_prompt,
-        triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
-        workspace_directory=workspace.root,
-    )
-    return agent
+    enabled_command_categories = [
+        x
+        for x in COMMAND_CATEGORIES
+        if x not in agent_test_config.disabled_command_categories
+    ]
+
+    for command_category in enabled_command_categories:
+        command_registry.import_commands(command_category)
+
+    ai_goals = [
+        "Write to a file called output.txt tesla's revenue in 2022 after searching for 'tesla revenue 2022'.",
+        "Write to a file called output.txt tesla's revenue in 2022.",
+        "Write to a file called output.txt tesla's revenue every year since its creation.",
+    ]
+    for ai_goal in ai_goals:
+        ai_config = AIConfig(
+            ai_name="Information Retrieval Agent",
+            ai_role="an autonomous agent that specializes in retrieving information.",
+            ai_goals=[ai_goal],
+        )
+        ai_config.command_registry = command_registry
+        system_prompt = ai_config.construct_full_prompt()
+        Config().set_continuous_mode(False)
+        agents.append(
+            Agent(
+                ai_name="Information Retrieval Agent",
+                memory=memory_json_file,
+                command_registry=command_registry,
+                config=ai_config,
+                next_action_count=0,
+                system_prompt=system_prompt,
+                triggering_prompt=DEFAULT_TRIGGERING_PROMPT,
+                workspace_directory=workspace.root,
+            )
+        )
+    return agents

 @pytest.fixture


@@ -1,4 +1,3 @@
-import contextlib
 import os
 from functools import wraps
 from typing import Any, Callable, Optional
@@ -23,6 +22,7 @@ def challenge(func: Callable[..., Any]) -> Callable[..., None]:
     @wraps(func)
     def wrapper(*args: Any, **kwargs: Any) -> None:
         run_remaining = MAX_LEVEL_TO_IMPROVE_ON if Challenge.BEAT_CHALLENGES else 1
+        original_error = None

         while run_remaining > 0:
             current_score, new_score, new_score_location = get_scores()
@@ -32,9 +32,12 @@ def challenge(func: Callable[..., Any]) -> Callable[..., None]:
             )
             if challenge.level_to_run is not None:
                 kwargs["level_to_run"] = challenge.level_to_run
-                with contextlib.suppress(AssertionError):
+                try:
                     func(*args, **kwargs)
                     challenge.succeeded = True
+                except AssertionError as err:
+                    original_error = err
+                    challenge.succeeded = False
             else:
                 challenge.skipped = True
             if os.environ.get("CI") == "true":
@@ -51,6 +54,8 @@ def challenge(func: Callable[..., Any]) -> Callable[..., None]:
                 if Challenge.BEAT_CHALLENGES or challenge.is_new_challenge:
                     # xfail
                     pytest.xfail("Challenge failed")
+                if original_error:
+                    raise original_error
                 raise AssertionError("Challenge failed")
             run_remaining -= 1
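
The hunks above change the decorator from silently suppressing `AssertionError` to capturing it and re-raising it after the retry loop, so a failed challenge reports the real assertion message instead of the generic "Challenge failed". A minimal standalone sketch of that pattern (simplified: the real wrapper also tracks scores, skipping, CI detection, and xfail, and `attempts` stands in for the `run_remaining` bookkeeping):

```python
from functools import wraps
from typing import Any, Callable

def retry_challenge(attempts: int) -> Callable[[Callable[..., Any]], Callable[..., None]]:
    def decorator(func: Callable[..., Any]) -> Callable[..., None]:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> None:
            original_error = None
            for _ in range(attempts):
                try:
                    func(*args, **kwargs)
                    return  # challenge beaten; nothing to report
                except AssertionError as err:
                    original_error = err  # keep the informative failure
            if original_error:
                raise original_error  # surface the real assertion message
        return wrapper
    return decorator
```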


@@ -11,7 +11,7 @@
     },
     "information_retrieval": {
         "information_retrieval_challenge_a": {
-            "max_level": 1,
+            "max_level": 3,
             "max_level_beaten": 1
         },
         "information_retrieval_challenge_b": {


@@ -9,6 +9,7 @@ from tests.integration.challenges.utils import run_interaction_loop
 from tests.utils import requires_api_key

 CYCLE_COUNT = 3
+EXPECTED_REVENUES = [["81"], ["81"], ["81", "53", "24", "21", "11", "7", "4", "3", "2"]]
 from autogpt.agent import Agent
@@ -16,7 +17,7 @@ from autogpt.agent import Agent
 @requires_api_key("OPENAI_API_KEY")
 @challenge
 def test_information_retrieval_challenge_a(
-    get_company_revenue_agent: Agent,
+    information_retrieval_agents: list[Agent],
     monkeypatch: pytest.MonkeyPatch,
     patched_api_requestor: None,
     config: Config,
@@ -28,8 +29,13 @@ def test_information_retrieval_challenge_a(
-    :param get_company_revenue_agent: The agent to test.
+    :param information_retrieval_agents: The agents to test, one per challenge level.
     :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
     """
-    run_interaction_loop(monkeypatch, get_company_revenue_agent, CYCLE_COUNT)
+    information_retrieval_agent = information_retrieval_agents[level_to_run - 1]
+    run_interaction_loop(monkeypatch, information_retrieval_agent, CYCLE_COUNT)

-    file_path = str(get_company_revenue_agent.workspace.get_path("output.txt"))
+    file_path = str(information_retrieval_agent.workspace.get_path("output.txt"))
     content = read_file(file_path, config)
-    assert "81" in content, "Expected the file to contain 81"
+    expected_revenues = EXPECTED_REVENUES[level_to_run - 1]
+    for revenue in expected_revenues:
+        assert (
+            f"{revenue}." in content or f"{revenue}," in content
+        ), f"Expected the file to contain {revenue}"