diff --git a/agbenchmark/config.json b/agbenchmark/config.json
deleted file mode 100644
index 3a03b741..00000000
--- a/agbenchmark/config.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "workspace": "${os.path.join(Path.home(), 'miniagi')}"
-}
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index f865a3d8..0a239201 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -134,9 +134,6 @@ suite_reports: dict[str, list] = {}
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
     challenge_data = item.funcargs.get("challenge_data", None)
-    HeliconeLockManager.write_custom_property("challenge", challenge_data["name"])
-
-
     if not challenge_data:
         # this will only happen for dummy dependency setup tests
         return
 
@@ -158,14 +155,18 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
 
     if call.when == "call":
+        test_name = ""
         # if it's a same task suite, we combine the report.
         # but not if it's a single --test
        if is_suite and is_suite.same_task and not flags:
+            test_name = is_suite.prefix
             generate_combined_suite_report(item, challenge_data, challenge_location)
         else:  # single non suite test
+            test_name = challenge_data["name"]
             generate_single_call_report(item, call, challenge_data)
         # else: it's a same_task=false suite (tests aren't combined)
+        HeliconeLockManager.write_custom_property("challenge", test_name)
 
     if call.when == "teardown":
         finalize_reports(item, challenge_data)
 
diff --git a/agbenchmark/reports/reports.py b/agbenchmark/reports/reports.py
index 17ddd399..e051c444 100644
--- a/agbenchmark/reports/reports.py
+++ b/agbenchmark/reports/reports.py
@@ -65,7 +65,6 @@ def generate_combined_suite_report(
             "success": False,
         },
     }
-
     if scores["scores_obj"][test_name] == 1:
         # add dependency successful here
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 0ca906be..c5078408 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -76,22 +76,14 @@ def start(
         )
         return 1
 
+    print("CONFIG_PATH", CONFIG_PATH)
+
     if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
         config = {}
         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
-            default=os.path.join(Path.home(), "workspace"),
-        )
-
-        config["entry_path"] = click.prompt(
-            "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
-            default="agbenchmark/benchmarks.py",
-        )
-
-        config["cutoff"] = click.prompt(
-            "Please enter a hard cutoff runtime for your agent per test",
-            default="60",
+            default=os.path.join("workspace"),
         )
 
         with open(CONFIG_PATH, "w") as f:
diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py
index 5a8e1273..d23d3ec2 100644
--- a/agbenchmark/utils/challenge.py
+++ b/agbenchmark/utils/challenge.py
@@ -120,7 +120,7 @@ class Challenge(ABC):
             print_content = (
                 f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
             )
-            if self.data.ground.type == "file_llm_evaluation":
+            if ground.type == "file_llm_evaluation":
                 return self.llm_eval(content, should_contain_word)
             elif should_contain_word not in content:
                 print(print_content, "False")
@@ -164,46 +164,49 @@ ANSWER:
         scores_dict = {}
         percentage = None
 
-        if isinstance(self.data.ground, Ground):
-            files_contents = self.get_artifacts_out(
-                config["workspace"], self.data.ground
-            )
-
-            for file_content in files_contents:
-                score = self.scoring(file_content, self.data.ground)
-                print("\033[1;32mYour score is:\033[0m", score)
-                scores.append(score)
-        elif isinstance(self.data.ground, dict):
-            # if it's a dict then we know its a combined suite
-            for ground_key in self.data.ground:
-                ground = self.data.ground[ground_key]
-                files_contents = self.get_artifacts_out(config["workspace"], ground)
+        try:
+            if isinstance(self.data.ground, Ground):
+                files_contents = self.get_artifacts_out(
+                    config["workspace"], self.data.ground
+                )
 
                 for file_content in files_contents:
-                    score = self.scoring(file_content, ground)
-                    scores_dict[ground_key] = score
-                    print(
-                        f"\033[1;35mScore for {ground_key}:\033[0m",
-                        scores_dict[ground_key],
-                    )
+                    score = self.scoring(file_content, self.data.ground)
+                    print("\033[1;32mYour score is:\033[0m", score)
+                    scores.append(score)
+            elif isinstance(self.data.ground, dict):
+                # if it's a dict then we know its a combined suite
+                for ground_key in self.data.ground:
+                    ground = self.data.ground[ground_key]
+                    files_contents = self.get_artifacts_out(config["workspace"], ground)
 
-            # Count the number of times the value 1.0 appears in the dictionary
-            num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
+                    for file_content in files_contents:
+                        score = self.scoring(file_content, ground)
+                        scores_dict[ground_key] = score
+                        print(
+                            f"\033[1;35mScore for {ground_key}:\033[0m",
+                            scores_dict[ground_key],
+                        )
 
-            # Calculate the percentage
-            percentage = round((num_ones / len(scores_dict)) * 100, 2)
+                # Count the number of times the value 1.0 appears in the dictionary
+                num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
 
-            # Print the result in green
-            print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
+                # Calculate the percentage
+                percentage = round((num_ones / len(scores_dict)) * 100, 2)
 
-            # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
-            # So for now we return 1.0 if there's any that pass
-            if percentage > 0:
-                scores.append(1.0)
-                if percentage != 100:
-                    print(
-                        "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
-                    )
+                # Print the result in green
+                print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
+
+                # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
+                # So for now we return 1.0 if there's any that pass
+                if percentage > 0:
+                    scores.append(1.0)
+                    if percentage != 100:
+                        print(
+                            "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
+                        )
+        except Exception as e:
+            print("Error getting scores", e)
 
         scores_data = {
             "values": scores,