mirror of https://github.com/aljazceru/Auto-GPT.git
hotfix reports (#191)
@@ -34,7 +34,7 @@ def create_single_test(
     # if its a parallel run suite we just give it the data
     if suite_config and suite_config.same_task:
         artifacts_location = str(Path(challenge_location).resolve())
-        if "--test" or "--maintain" or "--improve" in sys.argv:
+        if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
             artifacts_location = str(Path(challenge_location).resolve().parent.parent)
     else:
         setattr(
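Note: the old condition was always true, because the non-empty string literal "--test" is truthy and the `in sys.argv` check only binds to the last operand. A minimal standalone sketch of the difference (hypothetical argv for illustration only):

import sys

# Hypothetical command line for illustration only.
sys.argv = ["agbenchmark", "start"]

# Buggy form: "--test" is a truthy non-empty string, so the whole
# expression is True regardless of what is actually in sys.argv.
buggy = bool("--test" or "--maintain" or "--improve" in sys.argv)

# Fixed form: each flag is checked for membership individually.
fixed = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv

print(buggy)  # True, even though no flag was passed
print(fixed)  # False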
@@ -99,7 +99,7 @@ def create_challenge(
     grandparent_dir = path.parent.parent

     # if its a single test running we dont care about the suite
-    if "--test" or "--maintain" or "--improve" in sys.argv:
+    if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
         create_single_suite_challenge(suite_config, data, path)
         return json_files
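Same truthiness fix as the previous hunk. If this check keeps growing, an equivalent form could collect the flags once; this is only a sketch of an alternative, not what the commit does:

import sys

CLI_FLAGS = ("--test", "--maintain", "--improve")

# Equivalent to the corrected chained 'or' membership checks.
single_run = any(flag in sys.argv for flag in CLI_FLAGS)
print(single_run)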
@@ -191,8 +191,9 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
                 continue

             # --maintain and --improve flag
-            improve_flag = regression_tests.get(data["name"], None)
-            maintain_flag = not improve_flag
+            in_regression = regression_tests.get(data["name"], None)
+            improve_flag = in_regression and "--improve" in commands
+            maintain_flag = not in_regression and "--maintain" in commands
             if "--maintain" in commands and maintain_flag:
                 continue
             elif "--improve" in commands and improve_flag:
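The hunk above reworks the skip logic: under --maintain a test is skipped only if it is not in the regression file, and under --improve only if it is already in the regression file. A small self-contained sketch of that decision table (the regression_tests and commands values are invented; in the real code they come from the regression report and sys.argv):

# Illustrative values only.
regression_tests = {"TestWriteFile": {"difficulty": "basic"}}
commands = ["--maintain"]

for name in ("TestWriteFile", "TestDebugCode"):
    in_regression = regression_tests.get(name, None)
    improve_flag = in_regression and "--improve" in commands
    maintain_flag = not in_regression and "--maintain" in commands

    if "--maintain" in commands and maintain_flag:
        print(f"skip {name}: not in regression, --maintain run")
    elif "--improve" in commands and improve_flag:
        print(f"skip {name}: already in regression, --improve run")
    else:
        print(f"keep {name}")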
@@ -137,19 +137,25 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         return

     challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
+    is_suite = None
+    # this is a non same task suite, with the location pointing to a data.json
+    is_suite = SuiteConfig.suite_data_if_suite(
+        Path(__file__).parent.parent / Path(challenge_location)
+    )
+
     try:
         # this is for a same_task suite pointing to the directory where the suite lives
         is_suite = SuiteConfig.deserialize(
             Path(__file__).parent.parent / Path(challenge_location) / "suite.json"
         )
-    except:
+    except Exception as e:
         pass

+    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+
     if call.when == "call":
         # if it's a same task suite, we combine the report.
         # but not if it's a single --test
-        if is_suite and is_suite.same_task and "--test" not in sys.argv:
+        if is_suite and is_suite.same_task and not flags:
             generate_combined_suite_report(item, challenge_data, challenge_location)
         else:
             # single non suite test
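The net effect of this hunk is that combined suite reports are only produced on a full run; passing any of --test, --maintain or --improve now disables combining, not just --test. A minimal sketch of that gate (the flag names come from the diff, everything else is a hypothetical helper):

import sys

def should_combine_suite_report(is_suite_same_task: bool) -> bool:
    # Any of these flags means a partial/selective run, so per-challenge
    # reports stay separate instead of being merged into one suite report.
    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
    return is_suite_same_task and not flags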
@@ -159,7 +165,7 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
             finalize_reports(item, challenge_data)

     # for separate task suites (same_task=false), their data is the same as a regular suite, but we combined the report at the end
-    if is_suite and not is_suite.same_task:
+    if is_suite and not is_suite.same_task and not flags:
         suite_reports.setdefault(is_suite.prefix, []).append(challenge_data["name"])
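suite_reports groups challenge names by suite prefix so a combined report can be written once per suite at session end. A tiny standalone illustration of the setdefault-append pattern used here (sample names invented):

suite_reports: dict = {}

# Append each finished challenge under its suite prefix, creating the
# list on first use.
for prefix, challenge in [("TestRevenue", "TestRevenue_1.0"),
                          ("TestRevenue", "TestRevenue_1.1"),
                          ("TestSearch", "TestSearch_1.0")]:
    suite_reports.setdefault(prefix, []).append(challenge)

print(suite_reports)
# {'TestRevenue': ['TestRevenue_1.0', 'TestRevenue_1.1'], 'TestSearch': ['TestSearch_1.0']}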
@@ -45,8 +45,6 @@ def generate_combined_suite_report(
     )
     item.test_name = suite_config.prefix

-    print("Generating combined suite report...", challenge_data, challenge_location)
-
     data_paths = suite_config.get_data_paths(root_path / Path(challenge_location))
     scores = getattr(item, "scores", {})
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
@@ -296,7 +294,9 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:
 def session_finish(suite_reports: dict) -> None:
-    generate_separate_suite_reports(suite_reports)
+    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+    if not flags:
+        generate_separate_suite_reports(suite_reports)

     with open(CONFIG_PATH, "r") as f:
         config = json.load(f)
@@ -88,8 +88,8 @@ def calculate_info_test_path(reports_path: Path) -> str:
         print(f"Found {related_file_count} files with '{test_arg}' in the name")
         # Take the number from before the _ and add the .{number}

-        prefix = ""
-        math.floor(prefix_number)
+        prefix = 0
+        prefix = math.floor(prefix_number)

     run_name = f"{prefix}.{related_file_count}_{test_arg}.json"
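The old code computed math.floor(prefix_number) but discarded the result, so prefix stayed an empty string and report file names came out malformed. A small reproduction with made-up values:

import math

prefix_number = 1.0
related_file_count = 3
test_arg = "TestWriteFile"

# Before: the result of math.floor() was never assigned, prefix stayed "".
prefix = ""
math.floor(prefix_number)
print(f"{prefix}.{related_file_count}_{test_arg}.json")  # .3_TestWriteFile.json

# After: the floored value is assigned to prefix.
prefix = math.floor(prefix_number)
print(f"{prefix}.{related_file_count}_{test_arg}.json")  # 1.3_TestWriteFile.json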
@@ -148,32 +148,36 @@ def get_highest_success_difficulty(
     highest_difficulty_level = 0

     for test_name, test_data in data.items():
-        if test_data.get("tests", None):
-            highest_difficulty_str = test_data["metrics"]["highest_difficulty"]
-            try:
-                highest_difficulty = DifficultyLevel[highest_difficulty_str]
-                highest_difficulty_level = DIFFICULTY_MAP[highest_difficulty]
-            except KeyError:
-                print(
-                    f"Unexpected difficulty level '{highest_difficulty_str}' in test '{test_name}'"
-                )
-                continue
-        else:
-            if test_data["metrics"]["success"]:
-                difficulty_str = test_data["metrics"]["difficulty"]
-
-                try:
-                    difficulty_enum = DifficultyLevel[difficulty_str.lower()]
-                    difficulty_level = DIFFICULTY_MAP[difficulty_enum]
-
-                    if difficulty_level > highest_difficulty_level:
-                        highest_difficulty = difficulty_enum
-                        highest_difficulty_level = difficulty_level
-                except KeyError:
-                    print(
-                        f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
-                    )
-                    continue
+        try:
+            if test_data.get("tests", None):
+                highest_difficulty_str = test_data["metrics"]["highest_difficulty"]
+                try:
+                    highest_difficulty = DifficultyLevel[highest_difficulty_str]
+                    highest_difficulty_level = DIFFICULTY_MAP[highest_difficulty]
+                except KeyError:
+                    print(
+                        f"Unexpected difficulty level '{highest_difficulty_str}' in test '{test_name}'"
+                    )
+                    continue
+            else:
+                if test_data["metrics"]["success"]:
+                    difficulty_str = test_data["metrics"]["difficulty"]
+
+                    try:
+                        difficulty_enum = DifficultyLevel[difficulty_str.lower()]
+                        difficulty_level = DIFFICULTY_MAP[difficulty_enum]
+
+                        if difficulty_level > highest_difficulty_level:
+                            highest_difficulty = difficulty_enum
+                            highest_difficulty_level = difficulty_level
+                    except KeyError:
+                        print(
+                            f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
+                        )
+                        continue
+        except Exception:
+            print(f"Make sure you selected the right test, no reports were generated.")
+            break

     if highest_difficulty is not None:
         highest_difficulty_str = highest_difficulty.name  # convert enum to string
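DifficultyLevel[...] raises KeyError for names that are not enum members, which is why both lookups are wrapped; the new outer try/except additionally stops aggregation with a hint instead of crashing when a report has an unexpected shape. A minimal sketch of the lookup pattern with invented stand-ins (the real DifficultyLevel and DIFFICULTY_MAP live in agbenchmark and may differ):

from enum import Enum

# Hypothetical stand-ins for agbenchmark's DifficultyLevel / DIFFICULTY_MAP.
class DifficultyLevel(Enum):
    interface = "interface"
    basic = "basic"
    advanced = "advanced"

DIFFICULTY_MAP = {
    DifficultyLevel.interface: 1,
    DifficultyLevel.basic: 2,
    DifficultyLevel.advanced: 3,
}

for name in ("basic", "impossible"):
    try:
        level = DIFFICULTY_MAP[DifficultyLevel[name]]
        print(name, "->", level)
    except KeyError:
        # Unknown difficulty names land here, mirroring the report code.
        print(f"Unexpected difficulty level '{name}'")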
Submodule agent/mini-agi updated: 4a346ab7cb...3e83765fa5