diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py index e2fb1bc6..d2fc8dea 100644 --- a/benchmark/agbenchmark/reports/processing/report_types.py +++ b/benchmark/agbenchmark/reports/processing/report_types.py @@ -51,6 +51,8 @@ class Test(BaseModelBenchmark): category: List[str] task: str reached_cutoff: bool + metadata: Any + class ReportBase(BaseModelBenchmark): @@ -68,6 +70,7 @@ class Report(ReportBase): tests: Dict[str, Test] + class ReportV2(Test, ReportBase): test_name: str run_id: str | None diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/agbenchmark/utils/data_types.py index 74b50932..955b1d6a 100644 --- a/benchmark/agbenchmark/utils/data_types.py +++ b/benchmark/agbenchmark/utils/data_types.py @@ -174,6 +174,9 @@ class Category(str, Enum): GENERALIST = "general" CODING = "coding" SCRAPE_SYNTHESIZE = "scrape_synthesize" + GAIA_1 = "GAIA_1" + GAIA_2 = "GAIA_2" + GAIA_3 = "GAIA_3" class ChallengeData(BaseModel):