helicone and llm eval fixes

Silen Naihin
2023-07-27 14:07:46 +01:00
parent eb57b15380
commit 0e6be16d07
5 changed files with 45 additions and 53 deletions

View File

@@ -1,3 +0,0 @@
-{
-    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
-}

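The deleted default config above stored the workspace as a "${...}" template rather than a literal path. A minimal sketch of how such a value could be expanded at load time; the loader itself is not part of this diff, and only the exact pattern shown above is handled:

import re
from pathlib import Path


def expand_workspace(value: str) -> str:
    """Expand the '${os.path.join(Path.home(), ...)}' template form seen above."""
    match = re.fullmatch(r"\$\{os\.path\.join\(Path\.home\(\), '(.+)'\)\}", value)
    if match:
        # Resolve the captured directory name under the user's home directory.
        return str(Path.home() / match.group(1))
    return value


print(expand_workspace("${os.path.join(Path.home(), 'miniagi')}"))
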
View File

@@ -134,9 +134,6 @@ suite_reports: dict[str, list] = {}
def pytest_runtest_makereport(item: Any, call: Any) -> None:
    challenge_data = item.funcargs.get("challenge_data", None)
-    HeliconeLockManager.write_custom_property("challenge", challenge_data["name"])
    if not challenge_data:
        # this will only happen for dummy dependency setup tests
        return
@@ -158,14 +155,18 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
    if call.when == "call":
+        test_name = ""
        # if it's a same task suite, we combine the report.
        # but not if it's a single --test
        if is_suite and is_suite.same_task and not flags:
+            test_name = is_suite.prefix
            generate_combined_suite_report(item, challenge_data, challenge_location)
        else:
            # single non suite test
+            test_name = challenge_data["name"]
            generate_single_call_report(item, call, challenge_data)
        # else: it's a same_task=false suite (tests aren't combined)
+        HeliconeLockManager.write_custom_property("challenge", test_name)
    if call.when == "teardown":
        finalize_reports(item, challenge_data)

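The hook above now resolves a single test_name (the suite prefix for combined same-task suites, the challenge name otherwise) before tagging the Helicone request. A minimal, self-contained sketch of that resolution, using a stand-in lock manager and simplified suite data; apart from write_custom_property, all names here are illustrative:

from dataclasses import dataclass
from typing import Optional


class StubHeliconeLockManager:
    """Stand-in for the Helicone lock manager; just records custom properties."""

    properties: dict = {}

    @classmethod
    def write_custom_property(cls, key: str, value: str) -> None:
        cls.properties[key] = value


@dataclass
class SuiteConfig:
    same_task: bool
    prefix: str


def resolve_test_name(
    challenge_data: dict, is_suite: Optional[SuiteConfig], single_test_flags: bool
) -> str:
    # Combined same-task suites report under the suite prefix;
    # everything else reports under the individual challenge name.
    if is_suite and is_suite.same_task and not single_test_flags:
        return is_suite.prefix
    return challenge_data["name"]


name = resolve_test_name({"name": "TestWriteFile"}, None, single_test_flags=False)
StubHeliconeLockManager.write_custom_property("challenge", name)
print(StubHeliconeLockManager.properties)  # {'challenge': 'TestWriteFile'}
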
View File

@@ -65,7 +65,6 @@ def generate_combined_suite_report(
            "success": False,
        },
    }
    if scores["scores_obj"][test_name] == 1:
        # add dependency successful here

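The fragment above hints that the combined suite report keeps a per-test scores_obj and marks a test successful once its score reaches 1. A rough sketch of that bookkeeping with a hypothetical report shape; the real report structure is not shown in full in this hunk:

from typing import Any


def mark_successes(report: dict[str, Any]) -> None:
    """Flag tests whose score is exactly 1 as successful (hypothetical report shape)."""
    for test_name, score in report["scores_obj"].items():
        if score == 1:
            report["tests"][test_name]["metrics"]["success"] = True


report = {
    "tests": {"TestReadFile": {"metrics": {"success": False}}},
    "scores_obj": {"TestReadFile": 1.0},
}
mark_successes(report)
print(report["tests"]["TestReadFile"]["metrics"]["success"])  # True
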
View File

@@ -76,22 +76,14 @@ def start(
        )
        return 1
    print("CONFIG_PATH", CONFIG_PATH)
    if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
        config = {}
        config["workspace"] = click.prompt(
            "Please enter a new workspace path",
-            default=os.path.join(Path.home(), "workspace"),
-        )
-        config["entry_path"] = click.prompt(
-            "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
-            default="agbenchmark/benchmarks.py",
-        )
-        config["cutoff"] = click.prompt(
-            "Please enter a hard cutoff runtime for your agent per test",
-            default="60",
+            default=os.path.join("workspace"),
        )
        with open(CONFIG_PATH, "w") as f:

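The branch above now only prompts for a workspace before writing the config. As a rough illustration of the end result, the sketch below prompts with click and persists a minimal JSON file; the CONFIG_PATH value and the json.dump call are assumptions, since only the open() line appears in the hunk:

import json
import os

import click

CONFIG_PATH = "agbenchmark/config.json"  # assumed location, for illustration only


def ensure_config() -> dict:
    """Prompt for a workspace and persist it if no config exists yet."""
    if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
        config = {
            "workspace": click.prompt(
                "Please enter a new workspace path",
                default=os.path.join("workspace"),
            )
        }
        with open(CONFIG_PATH, "w") as f:
            json.dump(config, f, indent=4)
        return config
    with open(CONFIG_PATH) as f:
        return json.load(f)
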
View File

@@ -120,7 +120,7 @@ class Challenge(ABC):
            print_content = (
                f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
            )
-            if self.data.ground.type == "file_llm_evaluation":
+            if ground.type == "file_llm_evaluation":
                return self.llm_eval(content, should_contain_word)
            elif should_contain_word not in content:
                print(print_content, "False")
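
The file_llm_evaluation branch defers to self.llm_eval, whose body is not shown in this diff. A minimal sketch of what such an evaluator could look like, with the prompt wording and the injected complete callable purely illustrative; the real method presumably calls an LLM and parses a numeric score:

from typing import Callable

EVAL_PROMPT = """Evaluate the response below against the expected answer.
Reply with a single number between 0 and 1.

EXPECTED: {expected}
RESPONSE: {response}
ANSWER:"""


def llm_eval(response: str, expected: str, complete: Callable[[str], str]) -> float:
    """Score content with an LLM; `complete` maps a prompt to the model's text."""
    prompt = EVAL_PROMPT.format(expected=expected, response=response)
    try:
        return float(complete(prompt).strip())
    except ValueError:
        # Unparseable model output counts as a failed evaluation.
        return 0.0


# Example with a trivial stand-in "model":
print(llm_eval("hello world", "hello", lambda _prompt: "1.0"))  # 1.0
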
@@ -164,46 +164,49 @@ ANSWER:
        scores_dict = {}
        percentage = None
-        if isinstance(self.data.ground, Ground):
-            files_contents = self.get_artifacts_out(
-                config["workspace"], self.data.ground
-            )
-            for file_content in files_contents:
-                score = self.scoring(file_content, self.data.ground)
-                print("\033[1;32mYour score is:\033[0m", score)
-                scores.append(score)
-        elif isinstance(self.data.ground, dict):
-            # if it's a dict then we know its a combined suite
-            for ground_key in self.data.ground:
-                ground = self.data.ground[ground_key]
-                files_contents = self.get_artifacts_out(config["workspace"], ground)
-                for file_content in files_contents:
-                    score = self.scoring(file_content, ground)
-                    scores_dict[ground_key] = score
-                    print(
-                        f"\033[1;35mScore for {ground_key}:\033[0m",
-                        scores_dict[ground_key],
-                    )
-            # Count the number of times the value 1.0 appears in the dictionary
-            num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
-            # Calculate the percentage
-            percentage = round((num_ones / len(scores_dict)) * 100, 2)
-            # Print the result in green
-            print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
-            # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
-            # So for now we return 1.0 if there's any that pass
-            if percentage > 0:
-                scores.append(1.0)
-                if percentage != 100:
-                    print(
-                        "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
-                    )
+        try:
+            if isinstance(self.data.ground, Ground):
+                files_contents = self.get_artifacts_out(
+                    config["workspace"], self.data.ground
+                )
+                for file_content in files_contents:
+                    score = self.scoring(file_content, self.data.ground)
+                    print("\033[1;32mYour score is:\033[0m", score)
+                    scores.append(score)
+            elif isinstance(self.data.ground, dict):
+                # if it's a dict then we know its a combined suite
+                for ground_key in self.data.ground:
+                    ground = self.data.ground[ground_key]
+                    files_contents = self.get_artifacts_out(config["workspace"], ground)
+                    for file_content in files_contents:
+                        score = self.scoring(file_content, ground)
+                        scores_dict[ground_key] = score
+                        print(
+                            f"\033[1;35mScore for {ground_key}:\033[0m",
+                            scores_dict[ground_key],
+                        )
+                # Count the number of times the value 1.0 appears in the dictionary
+                num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
+                # Calculate the percentage
+                percentage = round((num_ones / len(scores_dict)) * 100, 2)
+                # Print the result in green
+                print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
+                # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
+                # So for now we return 1.0 if there's any that pass
+                if percentage > 0:
+                    scores.append(1.0)
+                    if percentage != 100:
+                        print(
+                            "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
+                        )
+        except Exception as e:
+            print("Error getting scores", e)
        scores_data = {
            "values": scores,