From b82277515f17866d3a06ebf641450d4e6ef269b2 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Tue, 25 Jul 2023 19:07:24 +0100
Subject: [PATCH] hotfix reports (#191)

---
 agbenchmark/challenges/test_all.py |  9 +++---
 agbenchmark/conftest.py            | 14 ++++++---
 agbenchmark/reports/utils.py       |  6 ++--
 agbenchmark/utils.py               | 50 ++++++++++++++++--------------
 agent/mini-agi                     |  2 +-
 5 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py
index 1f85c4f4..646b0464 100644
--- a/agbenchmark/challenges/test_all.py
+++ b/agbenchmark/challenges/test_all.py
@@ -34,7 +34,7 @@ def create_single_test(
     # if its a parallel run suite we just give it the data
     if suite_config and suite_config.same_task:
         artifacts_location = str(Path(challenge_location).resolve())
-        if "--test" or "--maintain" or "--improve" in sys.argv:
+        if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
             artifacts_location = str(Path(challenge_location).resolve().parent.parent)
         else:
             setattr(
@@ -99,7 +99,7 @@ def create_challenge(
     grandparent_dir = path.parent.parent

     # if its a single test running we dont care about the suite
-    if "--test" or "--maintain" or "--improve" in sys.argv:
+    if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
         create_single_suite_challenge(suite_config, data, path)
         return json_files

@@ -191,8 +191,9 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
            continue

         # --maintain and --improve flag
-        improve_flag = regression_tests.get(data["name"], None)
-        maintain_flag = not improve_flag
+        in_regression = regression_tests.get(data["name"], None)
+        improve_flag = in_regression and "--improve" in commands
+        maintain_flag = not in_regression and "--maintain" in commands
         if "--maintain" in commands and maintain_flag:
             continue
         elif "--improve" in commands and improve_flag:
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index c0f0baa4..d3f63c88 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -137,19 +137,25 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         return

     challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")

-    is_suite = None
+    # this is a non same task suite, with the location pointing to a data.json
+    is_suite = SuiteConfig.suite_data_if_suite(
+        Path(__file__).parent.parent / Path(challenge_location)
+    )
     try:
+        # this is for a same_task suite pointing to the directory where the suite lives
         is_suite = SuiteConfig.deserialize(
             Path(__file__).parent.parent / Path(challenge_location) / "suite.json"
         )
-    except:
+    except Exception as e:
         pass

+    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+
     if call.when == "call":
         # if it's a same task suite, we combine the report.
         # but not if it's a single --test
-        if is_suite and is_suite.same_task and "--test" not in sys.argv:
+        if is_suite and is_suite.same_task and not flags:
             generate_combined_suite_report(item, challenge_data, challenge_location)
         else:
             # single non suite test
@@ -159,7 +165,7 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         finalize_reports(item, challenge_data)

     # for separate task suites (same_task=false), their data is the same as a regular suite, but we combined the report at the end
-    if is_suite and not is_suite.same_task:
+    if is_suite and not is_suite.same_task and not flags:
         suite_reports.setdefault(is_suite.prefix, []).append(challenge_data["name"])


diff --git a/agbenchmark/reports/utils.py b/agbenchmark/reports/utils.py
index 9e7fb069..65d77238 100644
--- a/agbenchmark/reports/utils.py
+++ b/agbenchmark/reports/utils.py
@@ -45,8 +45,6 @@ def generate_combined_suite_report(
     )
     item.test_name = suite_config.prefix

-    print("Generating combined suite report...", challenge_data, challenge_location)
-
     data_paths = suite_config.get_data_paths(root_path / Path(challenge_location))
     scores = getattr(item, "scores", {})
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
@@ -296,7 +294,9 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:


 def session_finish(suite_reports: dict) -> None:
-    generate_separate_suite_reports(suite_reports)
+    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+    if not flags:
+        generate_separate_suite_reports(suite_reports)

     with open(CONFIG_PATH, "r") as f:
         config = json.load(f)
diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py
index 36d3133e..13d88dfa 100644
--- a/agbenchmark/utils.py
+++ b/agbenchmark/utils.py
@@ -88,8 +88,8 @@ def calculate_info_test_path(reports_path: Path) -> str:
        print(f"Found {related_file_count} files with '{test_arg}' in the name")

        # Take the number from before the _ and add the .{number}
-       prefix = ""
-       math.floor(prefix_number)
+       prefix = 0
+       prefix = math.floor(prefix_number)

        run_name = f"{prefix}.{related_file_count}_{test_arg}.json"

@@ -148,32 +148,36 @@ def get_highest_success_difficulty(
     highest_difficulty_level = 0

     for test_name, test_data in data.items():
-        if test_data.get("tests", None):
-            highest_difficulty_str = test_data["metrics"]["highest_difficulty"]
-            try:
-                highest_difficulty = DifficultyLevel[highest_difficulty_str]
-                highest_difficulty_level = DIFFICULTY_MAP[highest_difficulty]
-            except KeyError:
-                print(
-                    f"Unexpected difficulty level '{highest_difficulty_str}' in test '{test_name}'"
-                )
-                continue
-        else:
-            if test_data["metrics"]["success"]:
-                difficulty_str = test_data["metrics"]["difficulty"]
-
-                try:
-                    difficulty_enum = DifficultyLevel[difficulty_str.lower()]
-                    difficulty_level = DIFFICULTY_MAP[difficulty_enum]
-
-                    if difficulty_level > highest_difficulty_level:
-                        highest_difficulty = difficulty_enum
-                        highest_difficulty_level = difficulty_level
-                except KeyError:
-                    print(
-                        f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
-                    )
-                    continue
+        try:
+            if test_data.get("tests", None):
+                highest_difficulty_str = test_data["metrics"]["highest_difficulty"]
+                try:
+                    highest_difficulty = DifficultyLevel[highest_difficulty_str]
+                    highest_difficulty_level = DIFFICULTY_MAP[highest_difficulty]
+                except KeyError:
+                    print(
+                        f"Unexpected difficulty level '{highest_difficulty_str}' in test '{test_name}'"
+                    )
+                    continue
+            else:
+                if test_data["metrics"]["success"]:
+                    difficulty_str = test_data["metrics"]["difficulty"]
+
+                    try:
+                        difficulty_enum = DifficultyLevel[difficulty_str.lower()]
+                        difficulty_level = DIFFICULTY_MAP[difficulty_enum]
+
+                        if difficulty_level > highest_difficulty_level:
+                            highest_difficulty = difficulty_enum
+                            highest_difficulty_level = difficulty_level
+                    except KeyError:
+                        print(
+                            f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
+                        )
+                        continue
+        except Exception:
+            print(f"Make sure you selected the right test, no reports were generated.")
+            break

     if highest_difficulty is not None:
         highest_difficulty_str = highest_difficulty.name  # convert enum to string
diff --git a/agent/mini-agi b/agent/mini-agi
index 4a346ab7..3e83765f 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit 4a346ab7cb8dbcfd3bf2cee49448d26e01406ba3
+Subproject commit 3e83765fa54d240c80d0f9578083d5b11fe34ce4
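
A note on the flag checks fixed twice in test_all.py: the pre-patch condition parses as three independent operands joined by "or", of which only the last is a membership test. A non-empty string literal is always truthy, so the branch ran unconditionally. A minimal sketch of the difference, using only the standard library:

    import sys

    # Pre-patch: evaluates to "--test" (a truthy string) no matter what is in
    # sys.argv, because `or` short-circuits on the first truthy operand.
    broken = "--test" or "--maintain" or "--improve" in sys.argv

    # Post-patch: each flag gets its own membership test against sys.argv.
    flags = (
        "--test" in sys.argv
        or "--maintain" in sys.argv
        or "--improve" in sys.argv
    )

    print(bool(broken))  # always True
    print(flags)         # True only when one of the flags was actually passed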
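
The generate_tests() hunk also changes what --maintain and --improve select: the challenge is looked up in the regression list once, and each flag then skips the opposite group, so --maintain runs only challenges already in the regression (known-passing) list while --improve runs only those absent from it. A sketch of the selection rule, where should_skip is a hypothetical helper that mirrors the patched lines, and regression_tests and commands stand in for the values test_all.py loads at runtime:

    from typing import Any

    def should_skip(
        name: str, regression_tests: dict[str, Any], commands: list[str]
    ) -> bool:
        # mirrors the patched filter in generate_tests()
        in_regression = regression_tests.get(name, None)
        improve_flag = in_regression and "--improve" in commands
        maintain_flag = not in_regression and "--maintain" in commands
        if "--maintain" in commands and maintain_flag:
            return True  # not in the regression list: skip under --maintain
        elif "--improve" in commands and improve_flag:
            return True  # already in the regression list: skip under --improve
        return False

    regression = {"TestPassing": {"difficulty": "basic"}}
    print(should_skip("TestPassing", regression, ["--improve"]))  # True, skipped
    print(should_skip("TestFailing", regression, ["--improve"]))  # False, runs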
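
Finally, the calculate_info_test_path hunk fixes a discarded return value: math.floor() returns the floored number rather than mutating anything, so the old code computed it and threw the result away, leaving prefix as an empty string in the run name. A small before/after sketch (prefix_number and the test name are made-up values):

    import math

    prefix_number = 5.0

    # Pre-patch: the result of math.floor() is never assigned.
    prefix = ""
    math.floor(prefix_number)
    print(f"{prefix}.2_TestWriteFile.json")  # ".2_TestWriteFile.json"

    # Post-patch: the floored number is assigned to prefix.
    prefix = math.floor(prefix_number)
    print(f"{prefix}.2_TestWriteFile.json")  # "5.2_TestWriteFile.json"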