refactor(benchmark): Interface & type consolidation, and architecture change to allow adding challenge providers

Squashed commit of the following:

commit 7d6476d3297860f74c276d571da995d958a8cc1a
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 18:10:45 2024 +0100

    refactor(benchmark/challenge): Set up structure to support more challenge providers

    - Move `Challenge`, `ChallengeData`, `load_challenges` to `challenges/builtin.py` and rename to `BuiltinChallenge`, `BuiltinChallengeSpec`, `load_builtin_challenges`
    - Create `BaseChallenge` to serve as the interface and base class for different challenge implementations
    - Create `ChallengeInfo` model to serve as a universal challenge info object
    - Create `get_challenge_from_source_uri` function in `challenges/__init__.py` (see the sketch after this list)
    - Replace `ChallengeData` with `ChallengeInfo` everywhere except in `BuiltinChallenge`
    - Add strong typing to `task_informations` store in app.py
    - Use `call.duration` in `finalize_test_report` and remove `timer` fixture
    - Update docstring on `challenges/__init__.py:get_unique_categories`
    - Add docstring to `generate_test.py`
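
    To illustrate the new structure, here is a minimal sketch of the provider-dispatch pattern, condensed from the diffs further down this page. It is illustrative only: the real `BuiltinChallenge.from_source_uri` builds a challenge subclass from the referenced data.json spec file.

        from abc import ABC, abstractmethod
        from typing import ClassVar

        from pydantic import BaseModel


        class ChallengeInfo(BaseModel):
            """Universal challenge info object, shared by all challenge providers."""

            name: str
            task: str
            source_uri: str  # e.g. "__BUILTIN__/<path to data.json>"


        class BaseChallenge(ABC):
            """Interface and base class for all challenge implementations."""

            info: ClassVar[ChallengeInfo]

            @classmethod
            @abstractmethod
            def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
                """Construct a challenge class from its `source_uri`."""
                ...


        class BuiltinChallenge(BaseChallenge):
            SOURCE_URI_PREFIX = "__BUILTIN__"

            @classmethod
            def from_source_uri(cls, source_uri: str) -> type["BuiltinChallenge"]:
                # The real implementation loads the data.json spec referenced by
                # the URI and dynamically builds a challenge subclass from it.
                raise NotImplementedError


        def get_challenge_from_source_uri(source_uri: str) -> type[BaseChallenge]:
            provider_prefix = source_uri.split("/", 1)[0]
            if provider_prefix == BuiltinChallenge.SOURCE_URI_PREFIX:
                return BuiltinChallenge.from_source_uri(source_uri)
            # New challenge providers plug in here with their own URI prefixes.
            raise ValueError(f"Cannot resolve source_uri '{source_uri}'")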

commit 5df2aa7939b45d85a2c2b5de9ac0522330d1502a
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 16:58:01 2024 +0100

    refactor(benchmark): Refactor & rename functions in agent_interface.py and agent_api_interface.py

    - `copy_artifacts_into_temp_folder` -> `copy_challenge_artifacts_into_workspace`
    - `copy_agent_artifacts_into_folder` -> `download_agent_artifacts_into_folder`
    - Reorder the parameters of `run_api_agent` and `copy_challenge_artifacts_into_workspace`; use `Path` instead of `str` for path arguments (new signatures sketched below)
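
    For reference, a condensed sketch of the resulting signatures, taken from the diffs below (bodies omitted):

        from pathlib import Path
        from typing import AsyncIterator, Optional

        from agent_protocol_client import Step

        from agbenchmark.config import AgentBenchmarkConfig


        async def run_api_agent(  # was: run_api_agent(task: ChallengeData, ...) -> None
            task: str,
            config: AgentBenchmarkConfig,
            timeout: int,
            artifacts_location: Optional[Path] = None,  # was a required str
        ) -> AsyncIterator[Step]:  # now yields the agent's Steps
            ...


        def copy_challenge_artifacts_into_workspace(  # was: copy_artifacts_into_temp_folder
            challenge_dir_path: str | Path,   # source first ...
            artifact_folder_name: str,
            workspace: str | Path,            # ... destination last
        ) -> None:
            ...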

commit 6a256fef4c7950b7ee82fb801e70c83afe6b6f8b
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 16:02:25 2024 +0100

    refactor(benchmark): Refactor & typefix report generation and handling logic

    - Rename functions in reports.py and ReportManager.py to better reflect what they do
       - `get_previous_test_results` -> `get_and_update_success_history`
       - `generate_single_call_report` -> `initialize_test_report`
       - `finalize_reports` -> `finalize_test_report`
       - `ReportManager.end_info_report` -> `SessionReportManager.finalize_session_report`
    - Modify `pytest_runtest_makereport` hook in conftest.py to finalize the report immediately after the challenge finishes running, instead of after teardown (see the sketch after this list)
       - Move result processing logic from `initialize_test_report` to `finalize_test_report` in reports.py
    - Use `Test` and `Report` types from report_types.py instead of untyped dicts where possible, in reports.py, utils.py, and ReportManager.py
    - Differentiate `ReportManager` into `SessionReportManager`, `RegressionTestsTracker`, and `SuccessRatesTracker`
    - Move filtering of optional challenge categories from challenge.py (`Challenge.skip_optional_categories`) to conftest.py (`pytest_collection_modifyitems`)
    - Remove unused `scores` fixture in conftest.py
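
    A condensed sketch of the reworked hook, based on the conftest.py diff below: the report is initialized during the "setup" phase and finalized right after the "call" phase, so teardown no longer touches it. (`agbenchmark_config` is assumed to be the module-level config object in conftest.py.)

        import pytest

        from agbenchmark.challenges import BaseChallenge
        from agbenchmark.reports.reports import (
            finalize_test_report,
            initialize_test_report,
        )


        def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
            challenge: type[BaseChallenge] = item.cls  # type: ignore

            if call.when == "setup":
                test_name = item.nodeid.split("::")[1]
                item.user_properties.append(("test_name", test_name))
                initialize_test_report(item, challenge.info)

            if call.when == "call":
                # `call.duration` supplies the run time; no `timer` fixture needed.
                finalize_test_report(item, call, agbenchmark_config)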

commit 370d6dbf5df75d78e3878877968e8cd309d6d7fb
Author: Reinier van der Leer <pwuts@agpt.co>
Date:   Tue Jan 9 15:16:43 2024 +0100

    refactor(benchmark): Simplify models in report_types.py

    - Removed the `ForbidOptionalMeta` and `BaseModelBenchmark` classes.
    - Changed model attributes to optional: `Metrics.difficulty`, `Metrics.success`, `Metrics.success_percentage`, `Metrics.run_time`, and `Test.reached_cutoff`.
    - Added a validator to the `Metrics` model that requires the `success` and `run_time` fields if `attempted=True` (sketched below this list).
    - Added default values to all optional model fields.
    - Removed duplicate imports.
    - Added condition in process_report.py to prevent null lookups if `metrics.difficulty` is not set.
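
    The simplified `Metrics` model with the new validator looks roughly like this (condensed from the report_types.py diff below):

        from typing import Any

        from pydantic import BaseModel, Field, validator


        class Metrics(BaseModel):
            difficulty: str | None = None
            success: bool | None = None
            run_time: str | None = None
            fail_reason: str | None = None
            success_percentage: float | None = Field(default=None, alias="success_%")
            attempted: bool
            cost: float | None = None

            @validator("attempted")
            def require_metrics_if_attempted(cls, v: bool, values: dict[str, Any]):
                # `success` and `run_time` may only be omitted for unattempted tests.
                if v:
                    for f in ("success", "run_time"):
                        assert (
                            values.get(f) is not None
                        ), f"'{f}' must be defined if attempted is True"
                return v
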
Author: Reinier van der Leer
Date:   2024-01-18 15:19:06 +01:00
Parent: f2595af362
Commit: 9012ff4db2

16 changed files with 921 additions and 812 deletions

View File

@@ -2,27 +2,32 @@ import logging
import os
import time
from pathlib import Path
from typing import Optional
from typing import AsyncIterator, Optional
from agent_protocol_client import AgentApi, ApiClient, Configuration, TaskRequestBody
from agent_protocol_client import (
AgentApi,
ApiClient,
Configuration,
Step,
TaskRequestBody,
)
from agbenchmark.agent_interface import get_list_of_file_paths
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import ChallengeData
LOG = logging.getLogger(__name__)
logger = logging.getLogger(__name__)
async def run_api_agent(
task: ChallengeData,
task: str,
config: AgentBenchmarkConfig,
artifacts_location: str,
timeout: int,
) -> None:
artifacts_location: Optional[Path] = None,
) -> AsyncIterator[Step]:
configuration = Configuration(host=config.host)
async with ApiClient(configuration) as api_client:
api_instance = AgentApi(api_client)
task_request_body = TaskRequestBody(input=task.task)
task_request_body = TaskRequestBody(input=task)
start_time = time.time()
response = await api_instance.create_agent_task(
@@ -30,37 +35,33 @@ async def run_api_agent(
)
task_id = response.task_id
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_in"
)
i = 1
steps_remaining = True
while steps_remaining:
# Read the existing JSON data from the file
if artifacts_location:
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_in"
)
while True:
step = await api_instance.execute_agent_task_step(task_id=task_id)
print(f"[{task.name}] - step {step.name} ({i}. request)")
i += 1
yield step
if time.time() - start_time > timeout:
raise TimeoutError("Time limit exceeded")
if not step or step.is_last:
steps_remaining = False
break
# In "mock" mode, we cheat by giving the correct artifacts to pass the challenge
if os.getenv("IS_MOCK"):
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_out"
if artifacts_location:
# In "mock" mode, we cheat by giving the correct artifacts to pass the test
if os.getenv("IS_MOCK"):
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_out"
)
await download_agent_artifacts_into_folder(
api_instance, task_id, config.temp_folder
)
await copy_agent_artifacts_into_folder(
api_instance, task_id, config.temp_folder
)
async def copy_agent_artifacts_into_folder(
async def download_agent_artifacts_into_folder(
api_instance: AgentApi, task_id: str, folder: Path
):
artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
@@ -76,11 +77,10 @@ async def copy_agent_artifacts_into_folder(
folder = (folder / path).parent
if not folder.exists():
LOG.info(f"Creating directory {folder}")
folder.mkdir(parents=True)
file_path = folder / artifact.file_name
LOG.info(f"Writing file {file_path}")
logger.debug(f"Downloading agent artifact {artifact.file_name} to {folder}")
with open(file_path, "wb") as f:
content = await api_instance.download_agent_task_artifact(
task_id=task_id, artifact_id=artifact.artifact_id
@@ -90,7 +90,7 @@ async def copy_agent_artifacts_into_folder(
async def upload_artifacts(
api_instance: AgentApi, artifacts_location: str, task_id: str, type: str
api_instance: AgentApi, artifacts_location: Path, task_id: str, type: str
) -> None:
for file_path in get_list_of_file_paths(artifacts_location, type):
relative_path: Optional[str] = "/".join(

View File

@@ -18,8 +18,8 @@ def get_list_of_file_paths(
return list(source_dir.iterdir())
def copy_artifacts_into_temp_folder(
workspace: str | Path, artifact_folder_name: str, challenge_dir_path: str | Path
def copy_challenge_artifacts_into_workspace(
challenge_dir_path: str | Path, artifact_folder_name: str, workspace: str | Path
) -> None:
file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
for file_path in file_paths:

View File

@@ -5,10 +5,10 @@ import logging
import sys
import time
import uuid
from collections import defaultdict, deque
from collections import deque
from multiprocessing import Process
from pathlib import Path
from typing import Any, Optional
from typing import Optional
import httpx
import psutil
@@ -18,6 +18,7 @@ from fastapi import APIRouter, FastAPI, HTTPException, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Extra, ValidationError
from agbenchmark.challenges import ChallengeInfo
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types_v2 import (
BenchmarkRun,
@@ -27,14 +28,13 @@ from agbenchmark.reports.processing.report_types_v2 import (
TaskInfo,
)
from agbenchmark.schema import TaskEvalRequestBody
from agbenchmark.utils.data_types import ChallengeData
from agbenchmark.utils.utils import write_pretty_json
sys.path.append(str(Path(__file__).parent.parent))
logger = logging.getLogger(__name__)
CHALLENGES: dict[str, ChallengeData] = {}
CHALLENGES: dict[str, ChallengeInfo] = {}
challenges_path = Path(__file__).parent / "challenges"
challenge_spec_files = deque(
glob.glob(
@@ -52,7 +52,7 @@ while challenge_spec_files:
logger.debug(f"Loading {challenge_relpath}...")
try:
challenge_info = ChallengeData.parse_file(challenge_spec_file)
challenge_info = ChallengeInfo.parse_file(challenge_spec_file)
except ValidationError as e:
if logging.getLogger().level == logging.DEBUG:
logger.warning(f"Spec file {challenge_relpath} failed to load:\n{e}")
@@ -68,7 +68,14 @@ while challenge_spec_files:
CHALLENGES[challenge_info.eval_id] = challenge_info
task_informations = defaultdict(dict[str, Any])
class BenchmarkTaskInfo(BaseModel):
task_id: str
start_time: datetime.datetime
challenge_info: ChallengeInfo
task_informations: dict[str, BenchmarkTaskInfo] = {}
def find_agbenchmark_without_uvicorn():
@@ -124,12 +131,8 @@ def stream_output(pipe):
def setup_fastapi_app(agbenchmark_config: AgentBenchmarkConfig) -> FastAPI:
from agbenchmark.agent_api_interface import (
copy_agent_artifacts_into_folder,
upload_artifacts,
)
from agbenchmark.agent_interface import copy_artifacts_into_temp_folder
from agbenchmark.generate_test import create_challenge_from_spec_file
from agbenchmark.agent_api_interface import upload_artifacts
from agbenchmark.challenges import get_challenge_from_source_uri
from agbenchmark.main import run_benchmark
configuration = Configuration(
@@ -231,28 +234,29 @@ def setup_fastapi_app(agbenchmark_config: AgentBenchmarkConfig) -> FastAPI:
}
"""
try:
challenge_info = CHALLENGES[task_eval_request.eval_id]
async with ApiClient(configuration) as api_client:
api_instance = AgentApi(api_client)
task_input = CHALLENGES[task_eval_request.eval_id].task
task_input = challenge_info.task
task_request_body = TaskRequestBody(input=task_input)
task_response = await api_instance.create_agent_task(
task_request_body=task_request_body
)
task_informations[task_response.task_id][
"benchmark_start_time"
] = datetime.datetime.now(datetime.timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S+00:00"
)
task_informations[task_response.task_id][
"eval_id"
] = task_eval_request.eval_id
await upload_artifacts(
api_instance,
str(CHALLENGES[task_eval_request.eval_id].spec_file.parent),
task_response.task_id,
"artifacts_in",
task_info = BenchmarkTaskInfo(
task_id=task_response.task_id,
start_time=datetime.datetime.now(datetime.timezone.utc),
challenge_info=challenge_info,
)
task_informations[task_info.task_id] = task_info
if input_artifacts_dir := challenge_info.task_artifacts_dir:
await upload_artifacts(
api_instance,
input_artifacts_dir,
task_response.task_id,
"artifacts_in",
)
return task_response
except ApiException as e:
logger.error(f"Error whilst trying to create a task:\n{e}")
@@ -281,41 +285,39 @@ def setup_fastapi_app(agbenchmark_config: AgentBenchmarkConfig) -> FastAPI:
@router.post("/agent/tasks/{task_id}/evaluations")
async def create_evaluation(task_id: str) -> BenchmarkRun:
challenge_info = CHALLENGES[task_informations[task_id]["eval_id"]]
workspace = agbenchmark_config.temp_folder
task_info = task_informations[task_id]
challenge = get_challenge_from_source_uri(task_info.challenge_info.source_uri)
try:
async with ApiClient(configuration) as api_client:
api_instance = AgentApi(api_client)
await copy_agent_artifacts_into_folder(api_instance, task_id, workspace)
artifact_path = challenge_info.spec_file.parent
copy_artifacts_into_temp_folder(workspace, "custom_python", artifact_path)
challenge = create_challenge_from_spec_file(challenge_info.spec_file)
scores = challenge.get_scores(workspace)
is_score_100 = 1 in scores["values"]
eval_results = await challenge.evaluate_task_state(
api_instance, task_id
)
eval_info = BenchmarkRun(
repository_info=RepositoryInfo(),
run_details=RunDetails(
command=f"agbenchmark --test={challenge_info.name}",
command=f"agbenchmark --test={challenge.info.name}",
benchmark_start_time=(
task_informations[task_id]["benchmark_start_time"]
task_info.start_time.strftime("%Y-%m-%dT%H:%M:%S+00:00")
),
test_name=challenge_info.name,
test_name=challenge.info.name,
),
task_info=TaskInfo(
data_path=str(
challenge_info.spec_file.relative_to(challenges_path.parent)
),
data_path=challenge.info.source_uri,
is_regression=None,
category=[c.value for c in challenge_info.category],
task=challenge_info.task,
answer=challenge_info.ground.answer,
description=challenge_info.info.description,
category=[c.value for c in challenge.info.category],
task=challenge.info.task,
answer=challenge.info.reference_answer or "",
description=challenge.info.description or "",
),
metrics=Metrics(
success=is_score_100,
success=all(e.passed for e in eval_results),
success_percentage=(
100 * sum(e.score for e in eval_results) / len(eval_results)
if eval_results # avoid division by 0
else 0
),
attempted=True,
),
config={},

View File

@@ -3,14 +3,26 @@ import json
import logging
from pathlib import Path
from .base import BaseChallenge, ChallengeInfo
from .builtin import OPTIONAL_CATEGORIES
logger = logging.getLogger(__name__)
def get_challenge_from_source_uri(source_uri: str) -> type[BaseChallenge]:
from .builtin import BuiltinChallenge
provider_prefix = source_uri.split("/", 1)[0]
if provider_prefix == BuiltinChallenge.SOURCE_URI_PREFIX:
return BuiltinChallenge.from_source_uri(source_uri)
raise ValueError(f"Cannot resolve source_uri '{source_uri}'")
def get_unique_categories() -> set[str]:
"""
Find all data.json files in the directory relative to this file and its
subdirectories, read the "category" field from each file, and return a set of unique
categories.
Reads all challenge spec files and returns a set of all their categories.
"""
categories = set()
@@ -30,3 +42,11 @@ def get_unique_categories() -> set[str]:
continue
return categories
__all__ = [
"BaseChallenge",
"ChallengeInfo",
"get_unique_categories",
"OPTIONAL_CATEGORIES",
]

View File

@@ -0,0 +1,99 @@
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import AsyncIterator, ClassVar, Optional
import pytest
from agent_protocol_client import AgentApi, Step
from colorama import Fore, Style
from pydantic import BaseModel, Field
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
logger = logging.getLogger(__name__)
class ChallengeInfo(BaseModel):
eval_id: str = ""
name: str
task: str
task_artifacts_dir: Optional[Path] = None
category: list[Category]
difficulty: Optional[DifficultyLevel] = None
description: Optional[str] = None
dependencies: list[str] = Field(default_factory=list)
reference_answer: Optional[str]
source_uri: str
"""Internal reference indicating the source of the challenge specification"""
class BaseChallenge(ABC):
"""
The base class and shared interface for all specific challenge implementations.
"""
info: ClassVar[ChallengeInfo]
@classmethod
@abstractmethod
def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
"""
Construct an individual challenge subclass from a suitable `source_uri` (as in
`ChallengeInfo.source_uri`).
"""
...
@abstractmethod
def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
) -> None:
"""
Test method for use by Pytest-based benchmark sessions. Should return normally
if the challenge passes, and raise a (preferably descriptive) error otherwise.
"""
...
@classmethod
async def run_challenge(
cls, config: AgentBenchmarkConfig, timeout: int
) -> AsyncIterator[Step]:
"""
Runs the challenge on the subject agent with the specified timeout.
Also prints basic challenge and status info to STDOUT.
Params:
config: The subject agent's benchmark config.
timeout: Timeout (seconds) after which to stop the run if not finished.
Yields:
Step: The steps generated by the agent for the challenge task.
"""
# avoid circular import
from agbenchmark.agent_api_interface import run_api_agent
print()
print(
f"{Fore.MAGENTA + Style.BRIGHT}{'='*24} "
f"Starting {cls.info.name} challenge"
f" {'='*24}{Style.RESET_ALL}"
)
print(f"{Fore.CYAN}Timeout:{Fore.RESET} {timeout} seconds")
print(f"{Fore.CYAN}Task:{Fore.RESET} {cls.info.task}")
print()
logger.debug(f"Starting {cls.info.name} challenge run")
i = 0
async for step in run_api_agent(cls.info.task, config, timeout):
i += 1
print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
yield step
logger.debug(f"Finished {cls.info.name} challenge run")
@classmethod
@abstractmethod
async def evaluate_task_state(
cls, agent: AgentApi, task_id: str
) -> list[EvalResult]:
...

View File

@@ -0,0 +1,422 @@
from collections import deque
import glob
import json
import logging
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Any, ClassVar, Iterator, Literal, Optional
import pytest
from agent_protocol_client import AgentApi, ApiClient, Configuration as ClientConfig
from colorama import Fore, Style
from openai import _load_client as get_openai_client
from pydantic import BaseModel, constr, Field, validator
from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agbenchmark.utils.prompts import (
END_PROMPT,
FEW_SHOT_EXAMPLES,
PROMPT_MAP,
SCORING_MAP,
)
from .base import BaseChallenge, ChallengeInfo
logger = logging.getLogger(__name__)
with open(Path(__file__).parent / "optional_categories.json") as f:
OPTIONAL_CATEGORIES: list[str] = json.load(f)["optional_categories"]
class BuiltinChallengeSpec(BaseModel):
eval_id: str = ""
name: str
task: str
category: list[Category]
dependencies: list[str]
cutoff: int
class Info(BaseModel):
difficulty: DifficultyLevel
description: constr(regex=r"^Tests if the agent can.*")
side_effects: list[str] = Field(default_factory=list)
info: Info
class Ground(BaseModel):
answer: str
should_contain: Optional[list[str]] = None
should_not_contain: Optional[list[str]] = None
files: list[str]
case_sensitive: Optional[bool] = True
class Eval(BaseModel):
type: str
scoring: Optional[Literal["percentage", "scale", "binary"]]
template: Optional[Literal["rubric", "reference", "question", "custom"]]
examples: Optional[str]
@validator("scoring", "template", always=True)
def validate_eval_fields(cls, v, values, field):
if "type" in values and values["type"] == "llm":
if v is None:
raise ValueError(
f"{field.name} must be provided when eval type is 'llm'"
)
else:
if v is not None:
raise ValueError(
f"{field.name} should only exist when eval type is 'llm'"
)
return v
eval: Eval
ground: Ground
metadata: Optional[dict[str, Any]] = None
spec_file: Path | None = Field(None, exclude=True)
class BuiltinChallenge(BaseChallenge):
"""
Base class for AGBenchmark's built-in challenges (challenges/**/*.json).
All of the logic is present in this class. Individual challenges are created as
subclasses of `BuiltinChallenge` with challenge-specific values assigned to the
ClassVars `_spec` etc.
Dynamically constructing subclasses rather than class instances for the individual
challenges makes them suitable for collection by Pytest, which will run their
`test_method` like any regular test item.
"""
_spec: ClassVar[BuiltinChallengeSpec]
CHALLENGE_LOCATION: ClassVar[str]
ARTIFACTS_LOCATION: ClassVar[str]
SOURCE_URI_PREFIX = "__BUILTIN__"
@classmethod
def from_challenge_spec(
cls, spec: BuiltinChallengeSpec
) -> type["BuiltinChallenge"]:
if not spec.spec_file:
raise ValueError("spec.spec_file not defined")
challenge_info = ChallengeInfo(
eval_id=spec.eval_id,
name=spec.name,
task=spec.task,
task_artifacts_dir=spec.spec_file.parent,
category=spec.category,
difficulty=spec.info.difficulty,
description=spec.info.description,
dependencies=spec.dependencies,
reference_answer=spec.ground.answer,
source_uri=(
f"__BUILTIN__/{spec.spec_file.relative_to(Path(__file__).parent)}"
),
)
challenge_class_name = f"Test{challenge_info.name}"
logger.debug(f"Creating {challenge_class_name} from spec: {spec.spec_file}")
return type(
challenge_class_name,
(BuiltinChallenge,),
{
"info": challenge_info,
"_spec": spec,
"CHALLENGE_LOCATION": str(spec.spec_file),
"ARTIFACTS_LOCATION": str(spec.spec_file.resolve().parent),
},
)
@classmethod
def from_challenge_spec_file(cls, spec_file: Path) -> type["BuiltinChallenge"]:
challenge_spec = BuiltinChallengeSpec.parse_file(spec_file)
challenge_spec.spec_file = spec_file
return cls.from_challenge_spec(challenge_spec)
@classmethod
def from_source_uri(cls, source_uri: str) -> type["BuiltinChallenge"]:
if not source_uri.startswith(cls.SOURCE_URI_PREFIX):
raise ValueError(f"Invalid source_uri for BuiltinChallenge: {source_uri}")
path = source_uri.split("/", 1)[1]
spec_file = Path(__file__).parent / path
return cls.from_challenge_spec_file(spec_file)
@pytest.mark.asyncio
async def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
) -> None:
if os.environ.get("HELICONE_API_KEY"):
from helicone.lock import HeliconeLockManager
HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = self._spec.cutoff or 60
if request.config.getoption("--nc"):
timeout = 100000
elif cutoff := request.config.getoption("--cutoff"):
timeout = int(cutoff) # type: ignore
task_id = ""
timed_out = None
try:
async for step in self.run_challenge(config, timeout):
if not task_id:
task_id = step.task_id
if request.config.getoption("--mock"):
# Run only one step in mock mode
break
timed_out = False
except TimeoutError:
timed_out = True
request.node.user_properties.append(("timed_out", timed_out))
agent_client_config = ClientConfig(host=config.host)
async with ApiClient(agent_client_config) as api_client:
api_instance = AgentApi(api_client)
eval_results = await self.evaluate_task_state(api_instance, task_id)
if not eval_results:
if timed_out:
raise TimeoutError("Timed out, no results to evaluate")
else:
raise ValueError("No results to evaluate")
request.node.user_properties.append(
(
"answers",
[r.result for r in eval_results]
if request.config.getoption("--keep-answers")
else None,
)
)
request.node.user_properties.append(("scores", [r.score for r in eval_results]))
# FIXME: this allows partial failure
assert any(r.passed for r in eval_results), (
f"No passed evals: {eval_results}"
if not timed_out
else f"Timed out; no passed evals: {eval_results}"
)
@classmethod
async def evaluate_task_state(
cls, agent: AgentApi, task_id: str
) -> list[EvalResult]:
with tempfile.TemporaryDirectory() as workspace:
workspace = Path(workspace)
await download_agent_artifacts_into_folder(agent, task_id, workspace)
if cls.info.task_artifacts_dir:
copy_challenge_artifacts_into_workspace(
cls.info.task_artifacts_dir, "custom_python", workspace
)
return list(cls.evaluate_workspace_content(workspace))
@classmethod
def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
if cls._spec.task == "" and os.getenv("IS_MOCK"):
yield EvalResult(
result="This is a mock answer",
result_source="step_output",
score=1.0,
passed=True,
)
return
result_ground = cls._spec.ground
outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)
if result_ground.should_contain or result_ground.should_not_contain:
for source, content in outputs_for_eval:
score = cls.score_result(content, result_ground)
if score is not None:
print(f"{Fore.GREEN}Your score is:{Style.RESET_ALL}", score)
yield EvalResult(
result=content,
result_source=str(source),
score=score,
passed=score > 0.9, # FIXME: arbitrary threshold
)
if result_ground.eval.type == "llm":
combined_results = "\n".join(output[1] for output in outputs_for_eval)
llm_eval = cls.score_result_with_llm(combined_results, result_ground)
print(f"{Fore.GREEN}Your score is:{Style.RESET_ALL}", llm_eval)
if result_ground.eval.scoring == "percentage":
score = llm_eval / 100
elif result_ground.eval.scoring == "scale":
score = llm_eval / 10
else:
score = llm_eval
yield EvalResult(
result=combined_results,
result_source=", ".join(str(res[0]) for res in outputs_for_eval),
score=score,
passed=score > 0.9, # FIXME: arbitrary threshold
)
@staticmethod
def get_outputs_for_eval(
workspace: str | Path | dict[str, str], ground: BuiltinChallengeSpec.Ground
) -> Iterator[tuple[str | Path, str]]:
if isinstance(workspace, dict):
workspace = workspace["output"]
script_dir = workspace
for file_pattern in ground.files:
# Check if it is a file extension
if file_pattern.startswith("."):
# Find all files with the given extension in the workspace
matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern))
else:
# Otherwise, it is a specific file
matching_files = [os.path.join(script_dir, file_pattern)]
for file_path in matching_files:
if ground.eval.type == "python":
result = subprocess.run(
[sys.executable, file_path],
cwd=os.path.abspath(workspace),
capture_output=True,
text=True,
)
if "error" in result.stderr or result.returncode != 0:
print(result.stderr)
assert False, result.stderr
yield (
Path(file_path).relative_to(workspace),
f"Output: {result.stdout}\n",
)
else:
with open(file_path, "r") as f:
yield Path(file_path).relative_to(workspace), f.read()
else:
if ground.eval.type == "pytest":
result = subprocess.run(
[sys.executable, "-m", "pytest"],
cwd=os.path.abspath(workspace),
capture_output=True,
text=True,
)
if "error" in result.stderr or result.returncode != 0:
print(result.stderr)
assert False, result.stderr
yield "pytest", f"Output: {result.stdout}\n"
@staticmethod
def score_result(content: str, ground: BuiltinChallengeSpec.Ground) -> float | None:
print(f"{Fore.BLUE}Scoring content:{Style.RESET_ALL}", content)
if ground.should_contain:
for should_contain_word in ground.should_contain:
if not ground.case_sensitive:
should_contain_word = should_contain_word.lower()
content = content.lower()
print_content = (
f"{Fore.BLUE}Word that should exist{Style.RESET_ALL}"
f" - {should_contain_word}:"
)
if should_contain_word not in content:
print(print_content, "False")
return 0.0
else:
print(print_content, "True")
return 1.0
if ground.should_not_contain:
for should_not_contain_word in ground.should_not_contain:
if not ground.case_sensitive:
should_not_contain_word = should_not_contain_word.lower()
content = content.lower()
print_content = (
f"{Fore.BLUE}Word that should not exist{Style.RESET_ALL}"
f" - {should_not_contain_word}:"
)
if should_not_contain_word in content:
print(print_content, "False")
return 0.0
else:
print(print_content, "True")
return 1.0
@classmethod
def score_result_with_llm(
cls, content: str, ground: BuiltinChallengeSpec.Ground
) -> float:
if os.getenv("IS_MOCK"):
return 1.0
# the validation for this is done in the Eval BaseModel
scoring = SCORING_MAP[ground.eval.scoring] # type: ignore
prompt = PROMPT_MAP[ground.eval.template].format( # type: ignore
task=cls._spec.task, scoring=scoring, answer=ground.answer, response=content
)
if ground.eval.examples:
prompt += FEW_SHOT_EXAMPLES.format(examples=ground.eval.examples)
prompt += END_PROMPT
answer = get_openai_client().chat.completions.create(
model="gpt-4",
messages=[
{"role": "system", "content": prompt},
],
)
return float(answer.choices[0].message.content) # type: ignore
def load_builtin_challenges() -> Iterator[type[BuiltinChallenge]]:
logger.info("Loading built-in challenges...")
challenges_path = os.path.dirname(__file__)
logger.debug(f"Looking for challenge spec files in {challenges_path}...")
json_files = deque(
glob.glob(
f"{challenges_path}/**/data.json",
recursive=True,
)
)
logger.debug(f"Found {len(json_files)} built-in challenges.")
loaded, ignored = 0, 0
while json_files:
# Take and remove the first element from json_files
json_file = json_files.popleft()
if _challenge_should_be_ignored(json_file):
ignored += 1
continue
challenge = BuiltinChallenge.from_challenge_spec_file(Path(json_file))
logger.debug(f"Generated test for {challenge.info.name}")
yield challenge
loaded += 1
logger.info(
f"Loading built-in challenges complete: loaded {loaded}, ignored {ignored}."
)
def _challenge_should_be_ignored(json_file_path: str):
return (
"challenges/deprecated" in json_file_path
or "challenges/library" in json_file_path
)

View File

@@ -4,7 +4,7 @@ from datetime import datetime
from pathlib import Path
from typing import Optional
from pydantic import BaseSettings
from pydantic import BaseSettings, Field
def _calculate_info_test_path(base_path: Path, benchmark_start_time: datetime) -> Path:
@@ -57,7 +57,7 @@ class AgentBenchmarkConfig(BaseSettings, extra="allow"):
subject application exposes an Agent Protocol compliant API.
"""
agbenchmark_config_dir: Path
agbenchmark_config_dir: Path = Field(..., exclude=True)
"""Path to the agbenchmark_config folder of the subject agent application."""
categories: list[str] | None = None

View File

@@ -6,17 +6,18 @@ import shutil
import threading
import time
from pathlib import Path
from typing import Any, Generator
from typing import Generator
import pytest
from agbenchmark.challenges import OPTIONAL_CATEGORIES, BaseChallenge
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.ReportManager import RegressionTestsTracker
from agbenchmark.reports.reports import (
finalize_reports,
generate_single_call_report,
finalize_test_report,
initialize_test_report,
session_finish,
)
from agbenchmark.utils.challenge import Challenge
from agbenchmark.utils.data_types import Category
GLOBAL_TIMEOUT = (
@@ -28,7 +29,6 @@ logger = logging.getLogger(__name__)
pytest_plugins = ["agbenchmark.utils.dependencies"]
collect_ignore = ["challenges"]
suite_reports: dict[str, list] = {}
@pytest.fixture(scope="module")
@@ -118,18 +118,18 @@ def check_regression(request: pytest.FixtureRequest) -> None:
request: The request object from which the test name and the benchmark
configuration are retrieved.
"""
test_name = request.node.parent.name
with contextlib.suppress(FileNotFoundError):
regression_report = agbenchmark_config.regression_tests_file
data = json.loads(regression_report.read_bytes())
challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)
test_name = request.node.parent.name
challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
skip_string = f"Skipping {test_name} at {challenge_location}"
# Check if the test name exists in the regression tests
if request.config.getoption("--improve") and data.get(test_name, None):
is_regression_test = rt_tracker.has_regression_test(test_name)
if request.config.getoption("--improve") and is_regression_test:
pytest.skip(f"{skip_string} because it's a regression test")
elif request.config.getoption("--maintain") and not data.get(test_name, None):
elif request.config.getoption("--maintain") and not is_regression_test:
pytest.skip(f"{skip_string} because it's not a regression test")
@@ -149,24 +149,6 @@ def mock(request: pytest.FixtureRequest) -> bool:
return request.config.getoption("--mock")
@pytest.fixture(autouse=True, scope="function")
def timer(request: pytest.FixtureRequest) -> Generator[None, None, None]:
"""
Pytest fixture that times the execution of each test.
At the start of each test, it records the current time.
After the test function completes, it calculates the run time and adds it to
the test node's `user_properties`.
Args:
request: The `pytest.FixtureRequest` object through which the run time is stored
in the test node's `user_properties`.
"""
start_time = time.time()
yield
run_time = time.time() - start_time
request.node.user_properties.append(("run_time", run_time))
def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
"""
Pytest hook that is called when a test report is being generated.
@@ -176,21 +158,15 @@ def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
item: The test item for which the report is being generated.
call: The call object from which the test result is retrieved.
"""
challenge: type[Challenge] = item.cls # type: ignore
challenge_data = challenge.data
challenge_location = challenge.CHALLENGE_LOCATION
challenge: type[BaseChallenge] = item.cls # type: ignore
if call.when == "setup":
test_name = item.nodeid.split("::")[1]
item.user_properties.append(("test_name", test_name))
initialize_test_report(item, challenge.info)
if call.when == "call":
answers = getattr(item, "answers", None)
test_name = item.nodeid.split("::")[1]
item.test_name = test_name
generate_single_call_report(
item, call, challenge_data, answers, challenge_location, test_name
)
if call.when == "teardown":
finalize_reports(agbenchmark_config, item, challenge_data)
finalize_test_report(item, call, agbenchmark_config)
def timeout_monitor(start_time: int) -> None:
@@ -226,21 +202,7 @@ def pytest_sessionfinish(session: pytest.Session) -> None:
Finalizes and saves the test reports.
"""
session_finish(agbenchmark_config, suite_reports)
@pytest.fixture
def scores(request: pytest.FixtureRequest) -> None:
"""
Pytest fixture that retrieves the scores of the test class.
The scores are retrieved from the `Challenge.scores` attribute
using the test class name.
Args:
request: The request object.
"""
challenge: type[Challenge] = request.node.cls
return challenge.scores.get(challenge.__name__)
session_finish(agbenchmark_config)
def pytest_collection_modifyitems(
@@ -255,10 +217,7 @@ def pytest_collection_modifyitems(
items: The collected test items to be modified.
config: The active pytest configuration.
"""
regression_file = agbenchmark_config.regression_tests_file
regression_tests: dict[str, Any] = (
json.loads(regression_file.read_bytes()) if regression_file.is_file() else {}
)
rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)
try:
challenges_beaten_in_the_past = json.loads(
@@ -277,7 +236,7 @@ def pytest_collection_modifyitems(
challenge = item.cls
challenge_name = item.cls.__name__
if not issubclass(challenge, Challenge):
if not issubclass(challenge, BaseChallenge):
item.warn(
pytest.PytestCollectionWarning(
f"Non-challenge item collected: {challenge}"
@@ -287,7 +246,7 @@ def pytest_collection_modifyitems(
continue
# --test: remove the test from the set if it's not specifically selected
if selected_tests and challenge.data.name not in selected_tests:
if selected_tests and challenge.info.name not in selected_tests:
items.remove(item)
continue
@@ -295,8 +254,8 @@ def pytest_collection_modifyitems(
# --maintain -> only challenges expected to be passed (= regression tests)
# --improve -> only challenges that so far are not passed (reliably)
# --explore -> only challenges that have never been passed
is_regression_test = regression_tests.get(challenge.data.name, None)
has_been_passed = challenges_beaten_in_the_past.get(challenge.data.name, False)
is_regression_test = rt_tracker.has_regression_test(challenge.info.name)
has_been_passed = challenges_beaten_in_the_past.get(challenge.info.name, False)
if (
(config.getoption("--maintain") and not is_regression_test)
or (config.getoption("--improve") and is_regression_test)
@@ -305,7 +264,7 @@ def pytest_collection_modifyitems(
items.remove(item)
continue
dependencies = challenge.data.dependencies
dependencies = challenge.info.dependencies
if (
config.getoption("--test")
or config.getoption("--no-dep")
@@ -319,17 +278,17 @@ def pytest_collection_modifyitems(
elif config.getoption("--improve"):
# Filter dependencies, keep only deps that are not "regression" tests
dependencies = [
d for d in dependencies if not regression_tests.get(d, None)
d for d in dependencies if not rt_tracker.has_regression_test(d)
]
# Set category markers
challenge_categories = [c.value for c in challenge.data.category]
challenge_categories = set(c.value for c in challenge.info.category)
for category in challenge_categories:
item.add_marker(category)
# Enforce category selection
if selected_categories:
if not set(challenge_categories).intersection(set(selected_categories)):
if not challenge_categories.intersection(set(selected_categories)):
items.remove(item)
continue
# # Filter dependencies, keep only deps from selected categories
@@ -338,6 +297,22 @@ def pytest_collection_modifyitems(
# if not set(d.categories).intersection(set(selected_categories))
# ]
# Skip items in optional categories that are not selected for the subject agent
challenge_optional_categories = challenge_categories & set(OPTIONAL_CATEGORIES)
if challenge_optional_categories and not (
agbenchmark_config.categories
and challenge_optional_categories.issubset(
set(agbenchmark_config.categories)
)
):
logger.debug(
f"Skipping {challenge_name}: "
f"category {' and '.join(challenge_optional_categories)} is optional, "
"and not explicitly selected in the benchmark config."
)
items.remove(item)
continue
# Add marker for the DependencyManager
item.add_marker(pytest.mark.depends(on=dependencies, name=challenge_name))

View File

@@ -1,75 +1,24 @@
import glob
"""
AGBenchmark's test discovery endpoint for Pytest.
This module is picked up by Pytest's *_test.py file matching pattern, and all challenge
classes in the module that conform to the `Test*` pattern are collected.
"""
import importlib
import logging
import os
from collections import deque
from pathlib import Path
from agbenchmark.utils.challenge import Challenge
from agbenchmark.utils.data_types import ChallengeData
DATA_CATEGORY = {}
from agbenchmark.challenges.builtin import load_builtin_challenges
logger = logging.getLogger(__name__)
DATA_CATEGORY = {}
def create_challenge_from_spec_file(spec_file: Path) -> type[Challenge]:
challenge = Challenge.from_challenge_spec(spec_file)
DATA_CATEGORY[challenge.data.name] = challenge.data.category[0].value
return challenge
def create_challenge_from_spec_file_path(spec_file_path: str) -> type[Challenge]:
spec_file = Path(spec_file_path).resolve()
return create_challenge_from_spec_file(spec_file)
def load_challenges() -> None:
logger.info("Loading challenges...")
challenges_path = os.path.join(os.path.dirname(__file__), "challenges")
logger.debug(f"Looking for challenges in {challenges_path}...")
json_files = deque(
glob.glob(
f"{challenges_path}/**/data.json",
recursive=True,
)
)
logger.debug(f"Found {len(json_files)} challenges.")
logger.debug(f"Sample path: {json_files[0]}")
loaded, ignored = 0, 0
while json_files:
# Take and remove the first element from json_files
json_file = json_files.popleft()
if challenge_should_be_ignored(json_file):
ignored += 1
continue
challenge_info = ChallengeData.parse_file(json_file)
challenge_class = create_challenge_from_spec_file_path(json_file)
logger.debug(f"Generated test for {challenge_info.name}")
_add_challenge_to_module(challenge_class)
loaded += 1
logger.info(f"Loading challenges complete: loaded {loaded}, ignored {ignored}.")
def challenge_should_be_ignored(json_file_path: str):
return (
"challenges/deprecated" in json_file_path
or "challenges/library" in json_file_path
)
def _add_challenge_to_module(challenge: type[Challenge]):
# Load challenges and attach them to this module
for challenge in load_builtin_challenges():
# Attach the Challenge class to this module so it can be discovered by pytest
module = importlib.import_module(__name__)
setattr(module, f"{challenge.__name__}", challenge)
setattr(module, challenge.__name__, challenge)
load_challenges()
# Build a map of challenge names and their primary category
DATA_CATEGORY[challenge.info.name] = challenge.info.category[0].value

View File

@@ -1,21 +1,29 @@
import copy
import json
import logging
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.graphs import save_single_radar_chart
from agbenchmark.reports.processing.process_report import get_agent_category
from agbenchmark.reports.processing.report_types import Report
from agbenchmark.reports.processing.report_types import MetricsOverall, Report, Test
from agbenchmark.utils.utils import get_highest_success_difficulty
logger = logging.getLogger(__name__)
class SingletonReportManager:
instance = None
INFO_MANAGER: "SessionReportManager"
REGRESSION_MANAGER: "RegressionTestsTracker"
SUCCESS_RATE_TRACKER: "SuccessRatesTracker"
def __new__(cls):
if not cls.instance:
cls.instance = super(SingletonReportManager, cls).__new__(cls)
@@ -26,17 +34,16 @@ class SingletonReportManager:
) # or any logic to fetch the datetime
# Make the Managers class attributes
cls.REGRESSION_MANAGER = ReportManager(
agent_benchmark_config.regression_tests_file,
benchmark_start_time_dt,
)
cls.INFO_MANAGER = ReportManager(
cls.INFO_MANAGER = SessionReportManager(
agent_benchmark_config.get_report_dir(benchmark_start_time_dt)
/ "report.json",
benchmark_start_time_dt,
)
cls.INTERNAL_INFO_MANAGER = ReportManager(
agent_benchmark_config.success_rate_file, benchmark_start_time_dt
cls.REGRESSION_MANAGER = RegressionTestsTracker(
agent_benchmark_config.regression_tests_file
)
cls.SUCCESS_RATE_TRACKER = SuccessRatesTracker(
agent_benchmark_config.success_rate_file
)
return cls.instance
@@ -44,39 +51,33 @@ class SingletonReportManager:
@classmethod
def clear_instance(cls):
cls.instance = None
cls.REGRESSION_MANAGER = None
cls.INFO_MANAGER = None
cls.INTERNAL_INFO_MANAGER = None
cls.REGRESSION_MANAGER = None
cls.SUCCESS_RATE_TRACKER = None
class ReportManager:
class BaseReportManager:
"""Abstracts interaction with the regression tests file"""
def __init__(self, report_file: Path, benchmark_start_time: datetime):
tests: dict[str, Any]
def __init__(self, report_file: Path):
self.report_file = report_file
self.start_time = time.time()
self.benchmark_start_time = benchmark_start_time
self.load()
def load(self) -> None:
if not self.report_file.exists():
self.report_file.parent.mkdir(exist_ok=True)
self.report_file.touch()
try:
with self.report_file.open("r") as f:
file_content = (
f.read().strip()
) # read the content and remove any leading/trailing whitespace
if file_content: # if file is not empty, load the json
data = json.loads(file_content)
self.tests = {k: data[k] for k in sorted(data)}
else: # if file is empty, assign an empty dictionary
self.tests = {}
data = json.load(f)
self.tests = {k: data[k] for k in sorted(data)}
except FileNotFoundError:
self.tests = {}
except json.decoder.JSONDecodeError: # If JSON is invalid
except json.decoder.JSONDecodeError as e:
logger.warning(f"Could not parse {self.report_file}: {e}")
self.tests = {}
self.save()
@@ -84,13 +85,6 @@ class ReportManager:
with self.report_file.open("w") as f:
json.dump(self.tests, f, indent=4)
def add_test(self, test_name: str, test_details: dict | list) -> None:
if test_name.startswith("Test"):
test_name = test_name[4:]
self.tests[test_name] = test_details
self.save()
def remove_test(self, test_name: str) -> None:
if test_name in self.tests:
del self.tests[test_name]
@@ -100,34 +94,61 @@ class ReportManager:
self.tests = {}
self.save()
def end_info_report(self, config: AgentBenchmarkConfig) -> None:
class SessionReportManager(BaseReportManager):
"""Abstracts interaction with the regression tests file"""
tests: dict[str, Test] | Report
def __init__(self, report_file: Path, benchmark_start_time: datetime):
super().__init__(report_file)
self.start_time = time.time()
self.benchmark_start_time = benchmark_start_time
def save(self) -> None:
with self.report_file.open("w") as f:
if isinstance(self.tests, Report):
f.write(self.tests.json(indent=4))
else:
json.dump({k: v.dict() for k, v in self.tests.items()}, f, indent=4)
def add_test_report(self, test_name: str, test_report: Test) -> None:
if isinstance(self.tests, Report):
raise RuntimeError("Session report already finalized")
if test_name.startswith("Test"):
test_name = test_name[4:]
self.tests[test_name] = test_report
self.save()
def finalize_session_report(self, config: AgentBenchmarkConfig) -> None:
command = " ".join(sys.argv)
self.tests = {
"command": command.split(os.sep)[-1],
"benchmark_git_commit_sha": "---",
"agent_git_commit_sha": "---",
"completion_time": datetime.now(timezone.utc).strftime(
if isinstance(self.tests, Report):
raise RuntimeError("Session report already finalized")
self.tests = Report(
command=command.split(os.sep)[-1],
benchmark_git_commit_sha="---",
agent_git_commit_sha="---",
completion_time=datetime.now(timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S+00:00"
),
"benchmark_start_time": self.benchmark_start_time.strftime(
benchmark_start_time=self.benchmark_start_time.strftime(
"%Y-%m-%dT%H:%M:%S+00:00"
),
"metrics": {
"run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
"highest_difficulty": get_highest_success_difficulty(self.tests),
"total_cost": self.get_total_costs(),
},
"tests": copy.copy(self.tests),
"config": {
k: v for k, v in json.loads(config.json()).items() if v is not None
},
}
Report.parse_obj(self.tests)
metrics=MetricsOverall(
run_time=str(round(time.time() - self.start_time, 2)) + " seconds",
highest_difficulty=get_highest_success_difficulty(self.tests),
total_cost=self.get_total_costs(),
),
tests=copy.copy(self.tests),
config=config.dict(exclude_none=True),
)
converted_data = Report.parse_obj(self.tests)
agent_categories = get_agent_category(converted_data)
agent_categories = get_agent_category(self.tests)
if len(agent_categories) > 1:
save_single_radar_chart(
agent_categories,
@@ -137,12 +158,15 @@ class ReportManager:
self.save()
def get_total_costs(self):
if isinstance(self.tests, Report):
tests = self.tests.tests
else:
tests = self.tests
total_cost = 0
all_costs_none = True
for test_name, test_data in self.tests.items():
cost = test_data["metrics"].get(
"cost", 0
) # gets the cost or defaults to 0 if cost is missing
for test_data in tests.values():
cost = test_data.metrics.cost or 0.0
if cost is not None: # check if cost is not None
all_costs_none = False
@@ -150,3 +174,32 @@ class ReportManager:
if all_costs_none:
total_cost = None
return total_cost
class RegressionTestsTracker(BaseReportManager):
"""Abstracts interaction with the regression tests file"""
tests: dict[str, dict]
def add_test(self, test_name: str, test_details: dict) -> None:
if test_name.startswith("Test"):
test_name = test_name[4:]
self.tests[test_name] = test_details
self.save()
def has_regression_test(self, test_name: str) -> bool:
return self.tests.get(test_name) is not None
class SuccessRatesTracker(BaseReportManager):
"""Abstracts interaction with the regression tests file"""
tests: dict[str, list[bool]]
def update(self, test_name: str, success_history: list[bool]) -> None:
if test_name.startswith("Test"):
test_name = test_name[4:]
self.tests[test_name] = success_history
self.save()

View File

@@ -46,7 +46,7 @@ def get_agent_category(report: Report) -> dict[str, Any]:
):
continue
categories.setdefault(category, 0)
if data.metrics.success:
if data.metrics.success and data.metrics.difficulty:
num_dif = STRING_DIFFICULTY_MAP[data.metrics.difficulty]
if num_dif > categories[category]:
categories[category] = num_dif

View File

@@ -1,48 +1,38 @@
from typing import Any, Dict, List, Union
from typing import Any, Dict, List
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, constr, validator
datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"
from pydantic import BaseModel, constr
class ForbidOptionalMeta(type(BaseModel)): # metaclass to forbid optional fields
def __new__(cls, name: str, bases: tuple, dct: Dict[str, Any]) -> Any:
for attr_name, attr_value in dct.items():
if (
getattr(attr_value, "__origin__", None) == Union
and type(None) in attr_value.__args__
):
raise TypeError(
f"Optional fields are forbidden, but found in {attr_name}"
)
return super().__new__(cls, name, bases, dct)
class BaseModelBenchmark(BaseModel, metaclass=ForbidOptionalMeta):
class Config:
extra = "forbid"
class Metrics(BaseModelBenchmark):
difficulty: str
success: bool
success_percentage: float = Field(..., alias="success_%")
run_time: str
fail_reason: str | None
class Metrics(BaseModel):
difficulty: str | None
success: bool | None = None
run_time: str | None = None
fail_reason: str | None = None
success_percentage: float | None = Field(default=None, alias="success_%")
attempted: bool
cost: float | None
cost: float | None = None
@validator("attempted")
def require_metrics_if_attempted(cls, v: bool, values: dict[str, Any]):
required_fields_if_attempted = ["success", "run_time"]
if v:
for f in required_fields_if_attempted:
assert (
values.get(f) is not None
), f"'{f}' must be defined if attempted is True"
return v
class MetricsOverall(BaseModelBenchmark):
class MetricsOverall(BaseModel):
run_time: str
highest_difficulty: str
percentage: float | None
total_cost: float | None
percentage: float | None = None
total_cost: float | None = None
class Test(BaseModelBenchmark):
class Test(BaseModel):
data_path: str
is_regression: bool
answer: str
@@ -50,19 +40,19 @@ class Test(BaseModelBenchmark):
metrics: Metrics
category: List[str]
task: str
reached_cutoff: bool
metadata: Any
reached_cutoff: bool | None = None # None if in progress
metadata: dict[str, Any] | None = Field(default_factory=dict)
class ReportBase(BaseModelBenchmark):
class ReportBase(BaseModel):
command: str
completion_time: str | None
completion_time: str | None = None
benchmark_start_time: constr(regex=datetime_format)
metrics: MetricsOverall
config: Dict[str, str | dict[str, str]]
agent_git_commit_sha: str | None
benchmark_git_commit_sha: str | None
repo_url: str | None
agent_git_commit_sha: str | None = None
benchmark_git_commit_sha: str | None = None
repo_url: str | None = None
class Report(ReportBase):

View File

@@ -3,13 +3,14 @@ import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict
import pytest
from agbenchmark.challenges import ChallengeInfo
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types import Metrics, Test
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import ChallengeData, DifficultyLevel
from agbenchmark.utils.data_types import DifficultyLevel
from agbenchmark.utils.utils import calculate_success_percentage
# from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone
@@ -17,24 +18,22 @@ from agbenchmark.utils.utils import calculate_success_percentage
logger = logging.getLogger(__name__)
def get_previous_test_results(
test_name: str, info_details: dict[str, Any]
) -> list[bool]:
def get_and_update_success_history(test_name: str, info_details: Test) -> list[bool]:
mock = os.getenv("IS_MOCK") # Check if --mock is in sys.argv
prev_test_results = SingletonReportManager().INTERNAL_INFO_MANAGER.tests.get(
prev_test_results = SingletonReportManager().SUCCESS_RATE_TRACKER.tests.get(
test_name, []
)
if not mock:
if not mock and info_details.metrics.success is not None:
# only add if it's an actual test
prev_test_results.append(info_details["metrics"]["success"])
SingletonReportManager().INTERNAL_INFO_MANAGER.add_test(
prev_test_results.append(info_details.metrics.success)
SingletonReportManager().SUCCESS_RATE_TRACKER.update(
test_name, prev_test_results
)
# can calculate success rate regardless of mock
info_details["metrics"]["success_%"] = calculate_success_percentage(
info_details.metrics.success_percentage = calculate_success_percentage(
prev_test_results
)
@@ -43,26 +42,22 @@ def get_previous_test_results(
def update_regression_tests(
prev_test_results: list[bool],
info_details: dict,
info_details: Test,
test_name: str,
test_details: dict,
) -> None:
if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
# if the last 3 tests were successful, add to the regression tests
info_details["is_regression"] = True
SingletonReportManager().REGRESSION_MANAGER.add_test(test_name, test_details)
info_details.is_regression = True
SingletonReportManager().REGRESSION_MANAGER.add_test(
test_name, info_details.dict(include={"difficulty", "data_path"})
)
def generate_single_call_report(
def initialize_test_report(
item: pytest.Item,
call: pytest.CallInfo,
challenge_data: ChallengeData,
answers: dict[str, Any],
challenge_location: str,
test_name: str,
) -> None:
difficulty = challenge_data.info.difficulty
challenge_info: ChallengeInfo,
):
difficulty = challenge_info.difficulty
if isinstance(difficulty, DifficultyLevel):
difficulty = difficulty.value
@@ -71,105 +66,73 @@ def generate_single_call_report(
# test_name = item.nodeid.split("::")[1]
# item.test_name = test_name
test_details = {
"difficulty": difficulty,
"data_path": challenge_location,
}
info_details: Any = {
"data_path": challenge_location,
"is_regression": False,
"category": challenge_data.category,
"task": challenge_data.task,
"answer": challenge_data.ground.answer,
"description": challenge_data.info.description,
"metrics": {
"difficulty": difficulty,
"success": False,
"attempted": True,
},
# "answers": answers,
}
if answers:
info_details["answers"] = answers
if challenge_data.metadata:
info_details["metadata"] = challenge_data.metadata
mock = os.getenv("IS_MOCK") # Check if --mock is in sys.argv
if call:
if call.excinfo is None:
info_details["metrics"]["success"] = True
else:
if not mock: # don't remove if it's a mock test
SingletonReportManager().REGRESSION_MANAGER.remove_test(test_name)
info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
if call.excinfo.typename == "Skipped":
info_details["metrics"]["attempted"] = False
prev_test_results: list[bool] = get_previous_test_results(test_name, info_details)
update_regression_tests(prev_test_results, info_details, test_name, test_details)
test_info = dict(item.user_properties).get("info_details") or Test(
data_path=challenge_info.source_uri,
is_regression=False,
category=[c.value for c in challenge_info.category],
task=challenge_info.task,
answer=challenge_info.reference_answer or "",
description=challenge_info.description or "",
metrics=Metrics(
difficulty=difficulty,
attempted=False,
),
)
# user facing reporting
if item:
item.info_details = info_details
item.user_properties.append(("info_details", test_info))
return info_details
return test_info
def finalize_reports(
config: AgentBenchmarkConfig, item: pytest.Item, challenge_data: ChallengeData
def finalize_test_report(
item: pytest.Item, call: pytest.CallInfo, config: AgentBenchmarkConfig
) -> None:
run_time = dict(item.user_properties).get("run_time")
user_properties: dict = dict(item.user_properties)
info_details = getattr(item, "info_details", {})
test_name = getattr(item, "test_name", "")
info_details: Test = user_properties.get("info_details", {})
test_name: str = user_properties.get("test_name", "")
mock = os.getenv("IS_MOCK") # Check if --mock is in sys.argv
logger.debug(f"Finalizing report with CallInfo: {vars(call)}")
if call.excinfo is None:
info_details.metrics.success = True
else:
if not mock: # don't remove if it's a mock test
SingletonReportManager().REGRESSION_MANAGER.remove_test(test_name)
info_details.metrics.fail_reason = str(call.excinfo.value)
if call.excinfo.typename == "Skipped":
info_details.metrics.attempted = False
info_details.metrics.attempted = True
info_details.metrics.run_time = f"{str(round(call.duration, 3))} seconds"
info_details.reached_cutoff = user_properties.get("timed_out", False)
prev_test_results: list[bool] = get_and_update_success_history(
test_name, info_details
)
update_regression_tests(prev_test_results, info_details, test_name)
if info_details and test_name:
if run_time is not None:
cost = None
# if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
# logger.debug("Getting cost from Helicone")
# cost = get_data_from_helicone(test_name)
# logger.debug(f"Cost: {cost}")
# if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
# logger.debug("Getting cost from Helicone")
# info_details.metrics.cost = get_data_from_helicone(test_name)
# logger.debug(f"Cost: {cost}")
info_details["metrics"]["cost"] = cost
if "--mock" not in sys.argv:
update_challenges_already_beaten(
config.challenges_already_beaten_file, info_details, test_name
)
if info_details["metrics"].get("success", None) is None:
info_details["metrics"]["attempted"] = False
info_details["metrics"]["success"] = False
elif (
info_details["metrics"].get("success") is False
and "attempted" not in info_details["metrics"]
):
info_details["metrics"]["attempted"] = False
info_details["metrics"]["run_time"] = f"{str(round(run_time, 3))} seconds"
info_details["reached_cutoff"] = float(run_time) > challenge_data.cutoff
if "--mock" not in sys.argv:
update_challenges_already_beaten(
config.challenges_already_beaten_file, info_details, test_name
)
if info_details.get("tests") is not None:
for nested_test_name, nested_test_info in info_details[
"tests"
].items():
update_challenges_already_beaten(
config.challenges_already_beaten_file,
nested_test_info,
nested_test_name,
)
-        SingletonReportManager().INFO_MANAGER.add_test(test_name, info_details)
+    SingletonReportManager().INFO_MANAGER.add_test_report(test_name, info_details)
 def update_challenges_already_beaten(
-    challenges_already_beaten_file: Path, info_details: Dict[str, Any], test_name: str
+    challenges_already_beaten_file: Path, info_details: Test, test_name: str
 ) -> None:
-    current_run_successful = info_details["metrics"]["success"]
+    current_run_successful = info_details.metrics.success
try:
with open(challenges_already_beaten_file, "r") as f:
challenge_data = json.load(f)
@@ -185,9 +148,7 @@ def update_challenges_already_beaten(
json.dump(challenge_data, f, indent=4)
-def session_finish(
-    agbenchmark_config: AgentBenchmarkConfig, suite_reports: dict
-) -> None:
-    SingletonReportManager().INTERNAL_INFO_MANAGER.save()
-    SingletonReportManager().INFO_MANAGER.end_info_report(agbenchmark_config)
+def session_finish(agbenchmark_config: AgentBenchmarkConfig) -> None:
+    SingletonReportManager().INFO_MANAGER.finalize_session_report(agbenchmark_config)
     SingletonReportManager().REGRESSION_MANAGER.save()
+    SingletonReportManager().SUCCESS_RATE_TRACKER.save()
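
For orientation, the sketch below shows one way the refactored reporting helpers could be driven from pytest hooks. It is a minimal illustration, not the benchmark's actual conftest wiring: the import paths and the `_agbenchmark_config` attribute used to fetch the config are assumptions, while `pytest_runtest_makereport` and `pytest_sessionfinish` are standard pytest hooks.

import pytest

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.reports import finalize_test_report, session_finish


def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
    # Finalize the test report right after the challenge has run, not during teardown.
    if call.when == "call":
        config: AgentBenchmarkConfig = item.config._agbenchmark_config  # hypothetical attribute
        finalize_test_report(item, call, config)


def pytest_sessionfinish(session: pytest.Session) -> None:
    # Flush the session report plus the regression and success-rate trackers to disk.
    session_finish(session.config._agbenchmark_config)  # hypothetical attribute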


@@ -1,284 +0,0 @@
import glob
import json
import logging
import math
import os
import subprocess
import sys
from abc import ABC
from pathlib import Path
from typing import Any, ClassVar, List
import pytest
from colorama import Fore, Style
from openai import OpenAI
from agbenchmark.agent_api_interface import run_api_agent
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import ChallengeData, Ground
from agbenchmark.utils.prompts import (
END_PROMPT,
FEW_SHOT_EXAMPLES,
PROMPT_MAP,
SCORING_MAP,
)
logger = logging.getLogger(__name__)
with open(
Path(__file__).parent.parent / "challenges" / "optional_categories.json"
) as f:
OPTIONAL_CATEGORIES: list[str] = json.load(f)["optional_categories"]
class Challenge(ABC):
"""The parent class to all specific challenges classes.
Defines helper methods for running a challenge"""
data: ChallengeData
CHALLENGE_LOCATION: ClassVar[str]
ARTIFACTS_LOCATION: ClassVar[str]
scores: ClassVar[dict[str, Any]] = {} # this is for suites
@staticmethod
def from_challenge_spec(spec_file: Path) -> type["Challenge"]:
challenge_data = ChallengeData.parse_file(spec_file)
challenge_class_name = f"Test{challenge_data.name}"
logger.debug(f"Creating {challenge_class_name} from spec: {spec_file}")
return type(
challenge_class_name,
(Challenge,),
{
"data": challenge_data,
"CHALLENGE_LOCATION": str(spec_file),
"ARTIFACTS_LOCATION": str(spec_file.resolve().parent),
},
)
# Define test method within the dynamically created class
@pytest.mark.asyncio
async def test_method(
self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
) -> None:
# skip optional categories
self.skip_optional_categories(config)
# if os.environ.get("HELICONE_API_KEY"):
# from helicone.lock import HeliconeLockManager
# HeliconeLockManager.write_custom_property("challenge", self.data.name)
timeout = self.data.cutoff or 60
if request.config.getoption("--nc"):
timeout = 100000
elif cutoff := request.config.getoption("--cutoff"):
timeout = int(cutoff)
await self.run_challenge(config, timeout)
scores = self.get_scores(config.temp_folder)
request.node.answers = (
scores["answers"] if request.config.getoption("--keep-answers") else None
)
del scores["answers"] # remove answers from scores
request.node.scores = scores # store scores in request.node
is_score_100 = 1 in scores["values"]
assert is_score_100
async def run_challenge(self, config: AgentBenchmarkConfig, cutoff: int) -> None:
from agbenchmark.agent_interface import copy_artifacts_into_temp_folder
if not self.data.task:
return
print(
f"{Fore.MAGENTA + Style.BRIGHT}{'='*24} "
f"Starting {self.data.name} challenge"
f" {'='*24}{Style.RESET_ALL}"
)
print(f"{Fore.BLACK}Task: {self.data.task}{Fore.RESET}")
await run_api_agent(self.data, config, self.ARTIFACTS_LOCATION, cutoff)
# hidden files are added after the agent runs. Hidden files can be python test files.
# We copy them in the temporary folder to make it easy to import the code produced by the agent
artifact_paths = [
self.ARTIFACTS_LOCATION,
str(Path(self.CHALLENGE_LOCATION).parent),
]
for path in artifact_paths:
copy_artifacts_into_temp_folder(config.temp_folder, "custom_python", path)
@staticmethod
def get_artifacts_out(
workspace: str | Path | dict[str, str], ground: Ground
) -> List[str]:
if isinstance(workspace, dict):
workspace = workspace["output"]
script_dir = workspace
files_contents = []
for file_pattern in ground.files:
# Check if it is a file extension
if file_pattern.startswith("."):
# Find all files with the given extension in the workspace
matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern))
else:
# Otherwise, it is a specific file
matching_files = [os.path.join(script_dir, file_pattern)]
for file_path in matching_files:
if ground.eval.type == "python":
result = subprocess.run(
[sys.executable, file_path],
cwd=os.path.abspath(workspace),
capture_output=True,
text=True,
)
if "error" in result.stderr or result.returncode != 0:
print(result.stderr)
assert False, result.stderr
files_contents.append(f"Output: {result.stdout}\n")
else:
with open(file_path, "r") as f:
files_contents.append(f.read())
else:
if ground.eval.type == "pytest":
result = subprocess.run(
[sys.executable, "-m", "pytest"],
cwd=os.path.abspath(workspace),
capture_output=True,
text=True,
)
if "error" in result.stderr or result.returncode != 0:
print(result.stderr)
assert False, result.stderr
files_contents.append(f"Output: {result.stdout}\n")
return files_contents
@staticmethod
def scoring(content: str, ground: Ground) -> float:
print(f"{Fore.BLUE}Scoring content:{Style.RESET_ALL}", content)
if ground.should_contain:
for should_contain_word in ground.should_contain:
if not getattr(ground, "case_sensitive", True):
should_contain_word = should_contain_word.lower()
content = content.lower()
print_content = (
f"{Fore.BLUE}Word that should exist{Style.RESET_ALL}"
f" - {should_contain_word}:"
)
if should_contain_word not in content:
print(print_content, "False")
return 0.0
else:
print(print_content, "True")
if ground.should_not_contain:
for should_not_contain_word in ground.should_not_contain:
if not getattr(ground, "case_sensitive", True):
should_not_contain_word = should_not_contain_word.lower()
content = content.lower()
print_content = (
f"{Fore.BLUE}Word that should not exist{Style.RESET_ALL}"
f" - {should_not_contain_word}:"
)
if should_not_contain_word in content:
print(print_content, "False")
return 0.0
else:
print(print_content, "True")
return 1.0
@classmethod
def llm_eval(cls, content: str, ground: Ground) -> float:
openai_client = OpenAI()
if os.getenv("IS_MOCK"):
return 1.0
# the validation for this is done in the Eval BaseModel
scoring = SCORING_MAP[ground.eval.scoring] # type: ignore
prompt = PROMPT_MAP[ground.eval.template].format( # type: ignore
task=cls.data.task, scoring=scoring, answer=ground.answer, response=content
)
if ground.eval.examples:
prompt += FEW_SHOT_EXAMPLES.format(examples=ground.eval.examples)
prompt += END_PROMPT
answer = openai_client.chat.completions.create(
model="gpt-4",
messages=[
{"role": "system", "content": prompt},
],
)
return float(answer.choices[0].message.content) # type: ignore
@classmethod
def get_scores(cls, workspace: Path) -> dict[str, Any]:
scores = []
scores_dict: Any = {}
percentage = None
answers = {}
try:
if cls.data.task == "" and os.getenv("IS_MOCK"):
scores = [1.0]
answers = {"mock": "This is a mock answer"}
elif isinstance(cls.data.ground, Ground):
files_contents = cls.get_artifacts_out(workspace, cls.data.ground)
answers = {"answer": files_contents}
for file_content in files_contents:
score = cls.scoring(file_content, cls.data.ground)
print(f"{Fore.GREEN}Your score is:{Style.RESET_ALL}", score)
scores.append(score)
if cls.data.ground.eval.type == "llm":
llm_eval = cls.llm_eval("\n".join(files_contents), cls.data.ground)
if cls.data.ground.eval.scoring == "percentage":
scores.append(math.ceil(llm_eval / 100))
elif cls.data.ground.eval.scoring == "scale":
scores.append(math.ceil(llm_eval / 10))
print(f"{Fore.GREEN}Your score is:{Style.RESET_ALL}", llm_eval)
scores.append(llm_eval)
except Exception as e:
print("Error getting scores", e)
scores_data = {
"values": scores,
"scores_obj": scores_dict,
"percentage": percentage,
"answers": answers,
}
cls.scores[cls.__name__] = scores_data
return scores_data
def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
return 1 # remove this once this works
if 1 in scores.get("scores_obj", {}).get(test_name, []):
return 1
return None
@classmethod
def skip_optional_categories(cls, config: AgentBenchmarkConfig) -> None:
challenge_categories = set(c.value for c in cls.data.category)
challenge_optional_categories = challenge_categories & set(OPTIONAL_CATEGORIES)
if challenge_optional_categories and not (
config.categories
and set(challenge_optional_categories).issubset(set(config.categories))
):
pytest.skip(
f"Category {', '.join(challenge_optional_categories)} is optional, "
"and not explicitly selected in the benchmark config."
)
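
As a side note, the `should_contain` / `should_not_contain` check in `Challenge.scoring` above reduces to a membership test over the produced content. The following is a self-contained sketch of that logic; the function name and plain-argument signature are illustrative only and not part of the benchmark's API.

def simple_match_score(
    content: str,
    should_contain: list[str] | None = None,
    should_not_contain: list[str] | None = None,
    case_sensitive: bool = True,
) -> float:
    """Return 1.0 if every required string is present and no forbidden string is, else 0.0."""
    # Illustrative helper, not part of agbenchmark.
    haystack = content if case_sensitive else content.lower()
    for word in should_contain or []:
        if (word if case_sensitive else word.lower()) not in haystack:
            return 0.0
    for word in should_not_contain or []:
        if (word if case_sensitive else word.lower()) in haystack:
            return 0.0
    return 1.0


# Example: passes the "contains" check case-insensitively.
assert simple_match_score("Hello, World!", should_contain=["world"], case_sensitive=False) == 1.0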


@@ -1,8 +1,7 @@
from enum import Enum
-from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Literal
-from pydantic import BaseModel, Field, constr, validator
+from pydantic import BaseModel
class DifficultyLevel(Enum):
@@ -29,87 +28,19 @@ DIFFICULTY_MAP = {
STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel}
class Info(BaseModel):
difficulty: DifficultyLevel
description: constr(regex=r"^Tests if the agent can.*")
side_effects: List[str]
@validator("difficulty", pre=True)
def difficulty_to_enum(cls: "Info", v: str | DifficultyLevel) -> DifficultyLevel:
"""Convert a string to an instance of DifficultyLevel."""
if isinstance(v, DifficultyLevel):
return v
if isinstance(v, str):
try:
return DifficultyLevel(v.lower())
except ValueError:
pass
raise ValueError(f"Cannot convert {v} to DifficultyLevel.")
class Eval(BaseModel):
type: str
scoring: Optional[str]
template: Optional[str]
examples: Optional[str]
@validator("scoring", "template", always=True)
def validate_eval_fields(cls, v, values, field):
if "type" in values and values["type"] == "llm":
if v is None:
raise ValueError(f"{field.name} must be provided when type is 'llm'")
else:
if v is not None:
raise ValueError(f"{field.name} should only exist when type is 'llm'")
return v
@validator("scoring")
def validate_scoring(cls, v):
if v is not None and v not in ["percentage", "scale", "binary"]:
raise ValueError(
"scoring must be either 'percentage', 'scale', or 'binary'"
)
return v
@validator("template")
def validate_template(cls, v):
if v is not None and v not in ["rubric", "reference", "question", "custom"]:
raise ValueError(
"template must be either 'rubric', 'reference', 'question', or 'custom'"
)
return v
class Ground(BaseModel):
answer: str
should_contain: Optional[List[str]] = None
should_not_contain: Optional[List[str]] = None
files: List[str]
case_sensitive: Optional[bool] = True
eval: Eval
class Category(str, Enum):
DATA = "data"
GENERALIST = "general"
CODING = "coding"
SCRAPE_SYNTHESIZE = "scrape_synthesize"
WEB = "web"
GAIA_1 = "GAIA_1"
GAIA_2 = "GAIA_2"
GAIA_3 = "GAIA_3"
class ChallengeData(BaseModel):
eval_id: str = ""
name: str
category: List[Category]
task: str
dependencies: List[str]
cutoff: int
ground: Ground | Dict[str, Ground]
info: Info | Dict[str, Info]
metadata: Optional[Dict[str, Any]] = None
spec_file: Path | None = Field(None, exclude=True)
class EvalResult(BaseModel):
result: str
result_source: Literal["step_output"] | str
score: float
passed: bool
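
The new `EvalResult` model gives challenge implementations a uniform shape for reporting individual evaluation outcomes. A hedged usage sketch, assuming the model lives in `agbenchmark.utils.data_types` as the surrounding diff suggests; the aggregation at the end is illustrative rather than the benchmark's actual pass/fail rule.

from agbenchmark.utils.data_types import EvalResult

results = [
    EvalResult(result="Hello, World!", result_source="step_output", score=1.0, passed=True),
    EvalResult(result="", result_source="output.txt", score=0.0, passed=False),
]

# A challenge could be considered passed if any evaluated artifact reached a full score.
challenge_passed = any(r.passed for r in results)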


@@ -8,6 +8,7 @@ from typing import Any, Optional
from dotenv import load_dotenv
from agbenchmark.reports.processing.report_types import Test
from agbenchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel
load_dotenv()
@@ -63,41 +64,31 @@ def get_test_path(json_file: str | Path) -> str:
def get_highest_success_difficulty(
-    data: dict, just_string: Optional[bool] = None
+    data: dict[str, Test], just_string: Optional[bool] = None
) -> str:
highest_difficulty = None
highest_difficulty_level = 0
for test_name, test_data in data.items():
try:
if test_data.get("tests", None):
highest_difficulty_str = test_data["metrics"]["highest_difficulty"]
if test_data.metrics.success:
difficulty_str = test_data.metrics.difficulty
if not difficulty_str:
continue
try:
highest_difficulty = DifficultyLevel[highest_difficulty_str]
highest_difficulty_level = DIFFICULTY_MAP[highest_difficulty]
difficulty_enum = DifficultyLevel[difficulty_str.lower()]
difficulty_level = DIFFICULTY_MAP[difficulty_enum]
if difficulty_level > highest_difficulty_level:
highest_difficulty = difficulty_enum
highest_difficulty_level = difficulty_level
except KeyError:
logger.warning(
f"Unexpected difficulty level '{highest_difficulty_str}' "
f"Unexpected difficulty level '{difficulty_str}' "
f"in test '{test_name}'"
)
continue
else:
if test_data["metrics"]["success"]:
difficulty_str = test_data["metrics"]["difficulty"]
try:
difficulty_enum = DifficultyLevel[difficulty_str.lower()]
difficulty_level = DIFFICULTY_MAP[difficulty_enum]
if difficulty_level > highest_difficulty_level:
highest_difficulty = difficulty_enum
highest_difficulty_level = difficulty_level
except KeyError:
logger.warning(
f"Unexpected difficulty level '{difficulty_str}' "
f"in test '{test_name}'"
)
continue
except Exception as e:
logger.warning(
"An unexpected error [1] occurred while analyzing report [2]."