From b82277515f17866d3a06ebf641450d4e6ef269b2 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Tue, 25 Jul 2023 19:07:24 +0100
Subject: [PATCH] hotfix reports (#191)

---
 agbenchmark/challenges/test_all.py |  9 +++---
 agbenchmark/conftest.py            | 14 ++++++---
 agbenchmark/reports/utils.py       |  6 ++--
 agbenchmark/utils.py               | 50 ++++++++++++++++--------------
 agent/mini-agi                     |  2 +-
 5 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py
index 1f85c4f4..646b0464 100644
--- a/agbenchmark/challenges/test_all.py
+++ b/agbenchmark/challenges/test_all.py
@@ -34,7 +34,7 @@ def create_single_test(
     # if its a parallel run suite we just give it the data
     if suite_config and suite_config.same_task:
         artifacts_location = str(Path(challenge_location).resolve())
-        if "--test" or "--maintain" or "--improve" in sys.argv:
+        if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
             artifacts_location = str(Path(challenge_location).resolve().parent.parent)
         else:
             setattr(
@@ -99,7 +99,7 @@ def create_challenge(
     grandparent_dir = path.parent.parent

     # if its a single test running we dont care about the suite
-    if "--test" or "--maintain" or "--improve" in sys.argv:
+    if "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv:
         create_single_suite_challenge(suite_config, data, path)
         return json_files

@@ -191,8 +191,9 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
            continue

         # --maintain and --improve flag
-        improve_flag = regression_tests.get(data["name"], None)
-        maintain_flag = not improve_flag
+        in_regression = regression_tests.get(data["name"], None)
+        improve_flag = in_regression and "--improve" in commands
+        maintain_flag = not in_regression and "--maintain" in commands
         if "--maintain" in commands and maintain_flag:
             continue
         elif "--improve" in commands and improve_flag:
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index c0f0baa4..d3f63c88 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -137,19 +137,25 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         return

     challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")

-    is_suite = None
+    # this is a non same task suite, with the location pointing to a data.json
+    is_suite = SuiteConfig.suite_data_if_suite(
+        Path(__file__).parent.parent / Path(challenge_location)
+    )
     try:
+        # this is for a same_task suite pointing to the directory where the suite lives
         is_suite = SuiteConfig.deserialize(
             Path(__file__).parent.parent / Path(challenge_location) / "suite.json"
         )
-    except:
+    except Exception as e:
         pass

+    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+
     if call.when == "call":
         # if it's a same task suite, we combine the report.
         # but not if it's a single --test
-        if is_suite and is_suite.same_task and "--test" not in sys.argv:
+        if is_suite and is_suite.same_task and not flags:
             generate_combined_suite_report(item, challenge_data, challenge_location)
         else:
             # single non suite test
@@ -159,7 +165,7 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         finalize_reports(item, challenge_data)

     # for separate task suites (same_task=false), their data is the same as a regular suite, but we combined the report at the end
-    if is_suite and not is_suite.same_task:
+    if is_suite and not is_suite.same_task and not flags:
         suite_reports.setdefault(is_suite.prefix, []).append(challenge_data["name"])


diff --git a/agbenchmark/reports/utils.py b/agbenchmark/reports/utils.py
index 9e7fb069..65d77238 100644
--- a/agbenchmark/reports/utils.py
+++ b/agbenchmark/reports/utils.py
@@ -45,8 +45,6 @@ def generate_combined_suite_report(
     )
     item.test_name = suite_config.prefix

-    print("Generating combined suite report...", challenge_data, challenge_location)
-
     data_paths = suite_config.get_data_paths(root_path / Path(challenge_location))
     scores = getattr(item, "scores", {})
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv
@@ -296,7 +294,9 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:


 def session_finish(suite_reports: dict) -> None:
-    generate_separate_suite_reports(suite_reports)
+    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+    if not flags:
+        generate_separate_suite_reports(suite_reports)

     with open(CONFIG_PATH, "r") as f:
         config = json.load(f)
diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py
index 36d3133e..13d88dfa 100644
--- a/agbenchmark/utils.py
+++ b/agbenchmark/utils.py
@@ -88,8 +88,8 @@ def calculate_info_test_path(reports_path: Path) -> str:
        print(f"Found {related_file_count} files with '{test_arg}' in the name")

        # Take the number from before the _ and add the .{number}
-       prefix = ""
-       math.floor(prefix_number)
+       prefix = 0
+       prefix = math.floor(prefix_number)

        run_name = f"{prefix}.{related_file_count}_{test_arg}.json"

@@ -148,32 +148,36 @@ def get_highest_success_difficulty(
     highest_difficulty_level = 0

     for test_name, test_data in data.items():
-        if test_data.get("tests", None):
-            highest_difficulty_str = test_data["metrics"]["highest_difficulty"]
-            try:
-                highest_difficulty = DifficultyLevel[highest_difficulty_str]
-                highest_difficulty_level = DIFFICULTY_MAP[highest_difficulty]
-            except KeyError:
-                print(
-                    f"Unexpected difficulty level '{highest_difficulty_str}' in test '{test_name}'"
-                )
-                continue
-        else:
-            if test_data["metrics"]["success"]:
-                difficulty_str = test_data["metrics"]["difficulty"]
-
-                try:
-                    difficulty_enum = DifficultyLevel[difficulty_str.lower()]
-                    difficulty_level = DIFFICULTY_MAP[difficulty_enum]
-
-                    if difficulty_level > highest_difficulty_level:
-                        highest_difficulty = difficulty_enum
-                        highest_difficulty_level = difficulty_level
-                except KeyError:
-                    print(
-                        f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
-                    )
-                    continue
+        try:
+            if test_data.get("tests", None):
+                highest_difficulty_str = test_data["metrics"]["highest_difficulty"]
+                try:
+                    highest_difficulty = DifficultyLevel[highest_difficulty_str]
+                    highest_difficulty_level = DIFFICULTY_MAP[highest_difficulty]
+                except KeyError:
+                    print(
+                        f"Unexpected difficulty level '{highest_difficulty_str}' in test '{test_name}'"
+                    )
+                    continue
+            else:
+                if test_data["metrics"]["success"]:
+                    difficulty_str = test_data["metrics"]["difficulty"]
+
+                    try:
+                        difficulty_enum = DifficultyLevel[difficulty_str.lower()]
+                        difficulty_level = DIFFICULTY_MAP[difficulty_enum]
+
+                        if difficulty_level > highest_difficulty_level:
+                            highest_difficulty = difficulty_enum
+                            highest_difficulty_level = difficulty_level
+                    except KeyError:
+                        print(
+                            f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
+                        )
+                        continue
+        except Exception:
+            print(f"Make sure you selected the right test, no reports were generated.")
+            break

     if highest_difficulty is not None:
         highest_difficulty_str = highest_difficulty.name  # convert enum to string
diff --git a/agent/mini-agi b/agent/mini-agi
index 4a346ab7..3e83765f 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit 4a346ab7cb8dbcfd3bf2cee49448d26e01406ba3
+Subproject commit 3e83765fa54d240c80d0f9578083d5b11fe34ce4
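
A note on the flag checks fixed twice in test_all.py: the pre-patch condition parses as three independent operands joined by "or", of which only the last is a membership test. A non-empty string literal is always truthy, so the branch ran unconditionally. A minimal sketch of the difference, using only the standard library:

    import sys

    # Pre-patch: evaluates to "--test" (a truthy string) no matter what is in
    # sys.argv, because `or` short-circuits on the first truthy operand.
    broken = "--test" or "--maintain" or "--improve" in sys.argv

    # Post-patch: each flag gets its own membership test against sys.argv.
    flags = (
        "--test" in sys.argv
        or "--maintain" in sys.argv
        or "--improve" in sys.argv
    )

    print(bool(broken))  # always True
    print(flags)         # True only when one of the flags was actually passed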
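
The generate_tests() hunk also changes what --maintain and --improve select: the challenge is looked up in the regression list once, and each flag then skips the opposite group, so --maintain runs only challenges already in the regression (known-passing) list while --improve runs only those absent from it. A sketch of the selection rule, where should_skip is a hypothetical helper that mirrors the patched lines, and regression_tests and commands stand in for the values test_all.py loads at runtime:

    from typing import Any

    def should_skip(
        name: str, regression_tests: dict[str, Any], commands: list[str]
    ) -> bool:
        # mirrors the patched filter in generate_tests()
        in_regression = regression_tests.get(name, None)
        improve_flag = in_regression and "--improve" in commands
        maintain_flag = not in_regression and "--maintain" in commands
        if "--maintain" in commands and maintain_flag:
            return True  # not in the regression list: skip under --maintain
        elif "--improve" in commands and improve_flag:
            return True  # already in the regression list: skip under --improve
        return False

    regression = {"TestPassing": {"difficulty": "basic"}}
    print(should_skip("TestPassing", regression, ["--improve"]))  # True, skipped
    print(should_skip("TestFailing", regression, ["--improve"]))  # False, runs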
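
Finally, the calculate_info_test_path hunk fixes a discarded return value: math.floor() returns the floored number rather than mutating anything, so the old code computed it and threw the result away, leaving prefix as an empty string in the run name. A small before/after sketch (prefix_number and the test name are made-up values):

    import math

    prefix_number = 5.0

    # Pre-patch: the result of math.floor() is never assigned.
    prefix = ""
    math.floor(prefix_number)
    print(f"{prefix}.2_TestWriteFile.json")  # ".2_TestWriteFile.json"

    # Post-patch: the floored number is assigned to prefix.
    prefix = math.floor(prefix_number)
    print(f"{prefix}.2_TestWriteFile.json")  # "5.2_TestWriteFile.json"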