helicone and llm eval fixes

Silen Naihin
2023-07-27 14:07:46 +01:00
parent eb57b15380
commit 0e6be16d07
5 changed files with 45 additions and 53 deletions

View File

@@ -1,3 +0,0 @@
-{
-    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
-}

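The deleted default config above stored the workspace as a "${...}" template rather than a literal path. A minimal sketch of how such a value could be expanded at load time; the loader itself is not part of this diff, and only the exact pattern shown above is handled:

import re
from pathlib import Path


def expand_workspace(value: str) -> str:
    """Expand the '${os.path.join(Path.home(), ...)}' template form seen above."""
    match = re.fullmatch(r"\$\{os\.path\.join\(Path\.home\(\), '(.+)'\)\}", value)
    if match:
        # Resolve the captured directory name under the user's home directory.
        return str(Path.home() / match.group(1))
    return value


print(expand_workspace("${os.path.join(Path.home(), 'miniagi')}"))
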
View File

@@ -134,9 +134,6 @@ suite_reports: dict[str, list] = {}
def pytest_runtest_makereport(item: Any, call: Any) -> None:
    challenge_data = item.funcargs.get("challenge_data", None)
-    HeliconeLockManager.write_custom_property("challenge", challenge_data["name"])
    if not challenge_data:
        # this will only happen for dummy dependency setup tests
        return
@@ -158,14 +155,18 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
    if call.when == "call":
+        test_name = ""
        # if it's a same task suite, we combine the report.
        # but not if it's a single --test
        if is_suite and is_suite.same_task and not flags:
+            test_name = is_suite.prefix
            generate_combined_suite_report(item, challenge_data, challenge_location)
        else:
            # single non suite test
+            test_name = challenge_data["name"]
            generate_single_call_report(item, call, challenge_data)
        # else: it's a same_task=false suite (tests aren't combined)
+        HeliconeLockManager.write_custom_property("challenge", test_name)
    if call.when == "teardown":
        finalize_reports(item, challenge_data)

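The hook above now resolves a single test_name (the suite prefix for combined same-task suites, the challenge name otherwise) before tagging the Helicone request. A minimal, self-contained sketch of that resolution, using a stand-in lock manager and simplified suite data; apart from write_custom_property, all names here are illustrative:

from dataclasses import dataclass
from typing import Optional


class StubHeliconeLockManager:
    """Stand-in for the Helicone lock manager; just records custom properties."""

    properties: dict = {}

    @classmethod
    def write_custom_property(cls, key: str, value: str) -> None:
        cls.properties[key] = value


@dataclass
class SuiteConfig:
    same_task: bool
    prefix: str


def resolve_test_name(
    challenge_data: dict, is_suite: Optional[SuiteConfig], single_test_flags: bool
) -> str:
    # Combined same-task suites report under the suite prefix;
    # everything else reports under the individual challenge name.
    if is_suite and is_suite.same_task and not single_test_flags:
        return is_suite.prefix
    return challenge_data["name"]


name = resolve_test_name({"name": "TestWriteFile"}, None, single_test_flags=False)
StubHeliconeLockManager.write_custom_property("challenge", name)
print(StubHeliconeLockManager.properties)  # {'challenge': 'TestWriteFile'}
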
View File

@@ -65,7 +65,6 @@ def generate_combined_suite_report(
            "success": False,
        },
    }
    if scores["scores_obj"][test_name] == 1:
        # add dependency successful here

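The fragment above hints that the combined suite report keeps a per-test scores_obj and marks a test successful once its score reaches 1. A rough sketch of that bookkeeping with a hypothetical report shape; the real report structure is not shown in full in this hunk:

from typing import Any


def mark_successes(report: dict[str, Any]) -> None:
    """Flag tests whose score is exactly 1 as successful (hypothetical report shape)."""
    for test_name, score in report["scores_obj"].items():
        if score == 1:
            report["tests"][test_name]["metrics"]["success"] = True


report = {
    "tests": {"TestReadFile": {"metrics": {"success": False}}},
    "scores_obj": {"TestReadFile": 1.0},
}
mark_successes(report)
print(report["tests"]["TestReadFile"]["metrics"]["success"])  # True
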
View File

@@ -76,22 +76,14 @@ def start(
        )
        return 1
    print("CONFIG_PATH", CONFIG_PATH)
    if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
        config = {}
        config["workspace"] = click.prompt(
            "Please enter a new workspace path",
-            default=os.path.join(Path.home(), "workspace"),
-        )
-        config["entry_path"] = click.prompt(
-            "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
-            default="agbenchmark/benchmarks.py",
-        )
-        config["cutoff"] = click.prompt(
-            "Please enter a hard cutoff runtime for your agent per test",
-            default="60",
+            default=os.path.join("workspace"),
        )
        with open(CONFIG_PATH, "w") as f:

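The branch above now only prompts for a workspace before writing the config. As a rough illustration of the end result, the sketch below prompts with click and persists a minimal JSON file; the CONFIG_PATH value and the json.dump call are assumptions, since only the open() line appears in the hunk:

import json
import os

import click

CONFIG_PATH = "agbenchmark/config.json"  # assumed location, for illustration only


def ensure_config() -> dict:
    """Prompt for a workspace and persist it if no config exists yet."""
    if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
        config = {
            "workspace": click.prompt(
                "Please enter a new workspace path",
                default=os.path.join("workspace"),
            )
        }
        with open(CONFIG_PATH, "w") as f:
            json.dump(config, f, indent=4)
        return config
    with open(CONFIG_PATH) as f:
        return json.load(f)
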
View File

@@ -120,7 +120,7 @@ class Challenge(ABC):
            print_content = (
                f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
            )
-            if self.data.ground.type == "file_llm_evaluation":
+            if ground.type == "file_llm_evaluation":
                return self.llm_eval(content, should_contain_word)
            elif should_contain_word not in content:
                print(print_content, "False")
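
The file_llm_evaluation branch defers to self.llm_eval, whose body is not shown in this diff. A minimal sketch of what such an evaluator could look like, with the prompt wording and the injected complete callable purely illustrative; the real method presumably calls an LLM and parses a numeric score:

from typing import Callable

EVAL_PROMPT = """Evaluate the response below against the expected answer.
Reply with a single number between 0 and 1.

EXPECTED: {expected}
RESPONSE: {response}
ANSWER:"""


def llm_eval(response: str, expected: str, complete: Callable[[str], str]) -> float:
    """Score content with an LLM; `complete` maps a prompt to the model's text."""
    prompt = EVAL_PROMPT.format(expected=expected, response=response)
    try:
        return float(complete(prompt).strip())
    except ValueError:
        # Unparseable model output counts as a failed evaluation.
        return 0.0


# Example with a trivial stand-in "model":
print(llm_eval("hello world", "hello", lambda _prompt: "1.0"))  # 1.0
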
@@ -164,46 +164,49 @@ ANSWER:
        scores_dict = {}
        percentage = None
-        if isinstance(self.data.ground, Ground):
-            files_contents = self.get_artifacts_out(
-                config["workspace"], self.data.ground
-            )
-            for file_content in files_contents:
-                score = self.scoring(file_content, self.data.ground)
-                print("\033[1;32mYour score is:\033[0m", score)
-                scores.append(score)
-        elif isinstance(self.data.ground, dict):
-            # if it's a dict then we know its a combined suite
-            for ground_key in self.data.ground:
-                ground = self.data.ground[ground_key]
-                files_contents = self.get_artifacts_out(config["workspace"], ground)
-                for file_content in files_contents:
-                    score = self.scoring(file_content, ground)
-                    scores_dict[ground_key] = score
-                    print(
-                        f"\033[1;35mScore for {ground_key}:\033[0m",
-                        scores_dict[ground_key],
-                    )
-            # Count the number of times the value 1.0 appears in the dictionary
-            num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
-            # Calculate the percentage
-            percentage = round((num_ones / len(scores_dict)) * 100, 2)
-            # Print the result in green
-            print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
-            # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
-            # So for now we return 1.0 if there's any that pass
-            if percentage > 0:
-                scores.append(1.0)
-                if percentage != 100:
-                    print(
-                        "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
-                    )
+        try:
+            if isinstance(self.data.ground, Ground):
+                files_contents = self.get_artifacts_out(
+                    config["workspace"], self.data.ground
+                )
+                for file_content in files_contents:
+                    score = self.scoring(file_content, self.data.ground)
+                    print("\033[1;32mYour score is:\033[0m", score)
+                    scores.append(score)
+            elif isinstance(self.data.ground, dict):
+                # if it's a dict then we know its a combined suite
+                for ground_key in self.data.ground:
+                    ground = self.data.ground[ground_key]
+                    files_contents = self.get_artifacts_out(config["workspace"], ground)
+                    for file_content in files_contents:
+                        score = self.scoring(file_content, ground)
+                        scores_dict[ground_key] = score
+                        print(
+                            f"\033[1;35mScore for {ground_key}:\033[0m",
+                            scores_dict[ground_key],
+                        )
+                # Count the number of times the value 1.0 appears in the dictionary
+                num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
+                # Calculate the percentage
+                percentage = round((num_ones / len(scores_dict)) * 100, 2)
+                # Print the result in green
+                print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
+                # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
+                # So for now we return 1.0 if there's any that pass
+                if percentage > 0:
+                    scores.append(1.0)
+                    if percentage != 100:
+                        print(
+                            "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
+                        )
+        except Exception as e:
+            print("Error getting scores", e)
        scores_data = {
            "values": scores,