Mirror of https://github.com/aljazceru/Auto-GPT.git, synced 2026-01-01 13:24:22 +01:00
helicone and llm eval fixes
@@ -1,3 +0,0 @@
-{
-    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
-}
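The deleted file above is a checked-in miniagi config whose "workspace" value still holds an unexpanded template: a literal "${os.path.join(Path.home(), 'miniagi')}" string is never evaluated by json.load, so it could not resolve to a real directory. A minimal sketch, assuming the intent is to generate such a config at runtime so Python expands the home directory before the JSON is written (the config.json filename is illustrative):

# Sketch: build the workspace path in Python rather than shipping a
# literal "${os.path.join(Path.home(), 'miniagi')}" template string.
import json
import os
from pathlib import Path

config = {"workspace": os.path.join(Path.home(), "miniagi")}

with open("config.json", "w") as f:  # illustrative filename
    json.dump(config, f, indent=4)   # writes e.g. {"workspace": "/home/user/miniagi"}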
@@ -134,9 +134,6 @@ suite_reports: dict[str, list] = {}
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
     challenge_data = item.funcargs.get("challenge_data", None)

-    HeliconeLockManager.write_custom_property("challenge", challenge_data["name"])
-
-
     if not challenge_data:
         # this will only happen for dummy dependency setup tests
         return
@@ -158,14 +155,18 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv

     if call.when == "call":
+        test_name = ""
         # if it's a same task suite, we combine the report.
         # but not if it's a single --test
         if is_suite and is_suite.same_task and not flags:
+            test_name = is_suite.prefix
             generate_combined_suite_report(item, challenge_data, challenge_location)
         else:
             # single non suite test
+            test_name = challenge_data["name"]
             generate_single_call_report(item, call, challenge_data)
         # else: it's a same_task=false suite (tests aren't combined)
+        HeliconeLockManager.write_custom_property("challenge", test_name)

     if call.when == "teardown":
         finalize_reports(item, challenge_data)
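Taken together, the two hunks above move the Helicone tagging to after the challenge_data guard: the old code indexed challenge_data["name"] before the None check, which crashes on the dummy dependency-setup tests, and it also never recorded the suite prefix for combined suites. A minimal standalone sketch of the ordering fix; record_property is a hypothetical stand-in for HeliconeLockManager.write_custom_property:

from typing import Any, Optional


def record_property(key: str, value: str) -> None:
    # Stand-in for HeliconeLockManager.write_custom_property.
    print(f"custom property: {key}={value}")


def make_report(challenge_data: Optional[dict[str, Any]], when: str) -> None:
    # Indexing challenge_data["name"] before this guard raises
    # TypeError: 'NoneType' object is not subscriptable on dummy tests.
    if not challenge_data:
        return

    if when == "call":
        # Tag the run only once the test (or suite prefix) is known.
        record_property("challenge", challenge_data["name"])


make_report(None, "call")                       # dummy test: safely skipped
make_report({"name": "TestWriteFile"}, "call")  # real test: tagged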
@@ -65,7 +65,6 @@ def generate_combined_suite_report(
                 "success": False,
             },
         }

         if scores["scores_obj"][test_name] == 1:
-            # add dependency successful here
@@ -76,22 +76,14 @@ def start(
         )
         return 1

-    print("CONFIG_PATH", CONFIG_PATH)
-
     if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
         config = {}

         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
-            default=os.path.join("workspace"),
+            default=os.path.join(Path.home(), "workspace"),
         )

-        config["entry_path"] = click.prompt(
-            "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
-            default="agbenchmark/benchmarks.py",
-        )
-
         config["cutoff"] = click.prompt(
             "Please enter a hard cutoff runtime for your agent per test",
             default="60",
         )

     with open(CONFIG_PATH, "w") as f:
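The hunk cuts off at `with open(CONFIG_PATH, "w") as f:`; presumably the prompted values are then serialized to disk. A sketch of the resulting prompt-and-persist flow under that assumption -- the json.dump body and the CONFIG_PATH value are my assumptions, not shown in the diff:

import json
import os
from pathlib import Path

import click

CONFIG_PATH = "agbenchmark/config.json"  # illustrative; the real value is defined elsewhere

if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
    config = {
        "workspace": click.prompt(
            "Please enter a new workspace path",
            default=os.path.join(Path.home(), "workspace"),
        ),
        "cutoff": click.prompt(
            "Please enter a hard cutoff runtime for your agent per test",
            default="60",
        ),
    }
    with open(CONFIG_PATH, "w") as f:
        json.dump(config, f, indent=4)  # assumed persistence step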
@@ -120,7 +120,7 @@ class Challenge(ABC):
         print_content = (
             f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
         )
-        if self.data.ground.type == "file_llm_evaluation":
+        if ground.type == "file_llm_evaluation":
             return self.llm_eval(content, should_contain_word)
         elif should_contain_word not in content:
             print(print_content, "False")
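This one-line change dispatches on the ground argument passed into the check instead of the challenge-level self.data.ground; in a combined suite self.data.ground is a dict of grounds, so the old attribute access could not see a per-ground file_llm_evaluation type. A minimal sketch of why the local ground matters (the Ground dataclass here is a simplified stand-in for the benchmark's model):

from dataclasses import dataclass


@dataclass
class Ground:
    # Simplified stand-in for the benchmark's Ground model.
    type: str  # e.g. "file" or "file_llm_evaluation"


def check_word(content: str, word: str, ground: Ground) -> bool:
    if ground.type == "file_llm_evaluation":
        # The real class defers to self.llm_eval(content, word) here.
        return True
    return word in content


suite = {"write": Ground("file"), "summarize": Ground("file_llm_evaluation")}
for key, ground in suite.items():
    # With self.data.ground (a dict in suites), the llm branch never fires.
    print(key, check_word("hello world", "hello", ground))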
@@ -164,46 +164,49 @@ ANSWER:
         scores_dict = {}
         percentage = None

-        if isinstance(self.data.ground, Ground):
-            files_contents = self.get_artifacts_out(
-                config["workspace"], self.data.ground
-            )
-
-            for file_content in files_contents:
-                score = self.scoring(file_content, self.data.ground)
-                print("\033[1;32mYour score is:\033[0m", score)
-                scores.append(score)
-        elif isinstance(self.data.ground, dict):
-            # if it's a dict then we know its a combined suite
-            for ground_key in self.data.ground:
-                ground = self.data.ground[ground_key]
-                files_contents = self.get_artifacts_out(config["workspace"], ground)
-
-                for file_content in files_contents:
-                    score = self.scoring(file_content, ground)
-                    scores_dict[ground_key] = score
-                    print(
-                        f"\033[1;35mScore for {ground_key}:\033[0m",
-                        scores_dict[ground_key],
-                    )
-
-            # Count the number of times the value 1.0 appears in the dictionary
-            num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
-
-            # Calculate the percentage
-            percentage = round((num_ones / len(scores_dict)) * 100, 2)
-
-            # Print the result in green
-            print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
-
-            # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
-            # So for now we return 1.0 if there's any that pass
-            if percentage > 0:
-                scores.append(1.0)
-                if percentage != 100:
-                    print(
-                        "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
-                    )
+        try:
+            if isinstance(self.data.ground, Ground):
+                files_contents = self.get_artifacts_out(
+                    config["workspace"], self.data.ground
+                )
+
+                for file_content in files_contents:
+                    score = self.scoring(file_content, self.data.ground)
+                    print("\033[1;32mYour score is:\033[0m", score)
+                    scores.append(score)
+            elif isinstance(self.data.ground, dict):
+                # if it's a dict then we know its a combined suite
+                for ground_key in self.data.ground:
+                    ground = self.data.ground[ground_key]
+                    files_contents = self.get_artifacts_out(config["workspace"], ground)
+
+                    for file_content in files_contents:
+                        score = self.scoring(file_content, ground)
+                        scores_dict[ground_key] = score
+                        print(
+                            f"\033[1;35mScore for {ground_key}:\033[0m",
+                            scores_dict[ground_key],
+                        )
+
+                # Count the number of times the value 1.0 appears in the dictionary
+                num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
+
+                # Calculate the percentage
+                percentage = round((num_ones / len(scores_dict)) * 100, 2)
+
+                # Print the result in green
+                print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
+
+                # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
+                # So for now we return 1.0 if there's any that pass
+                if percentage > 0:
+                    scores.append(1.0)
+                    if percentage != 100:
+                        print(
+                            "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
+                        )
+        except Exception as e:
+            print("Error getting scores", e)

         scores_data = {
             "values": scores,