Ability to run by categories (#5229)

* Ability to run by categories

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>

* always use Path.cwd()

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>

---------

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
This commit is contained in:
merwanehamadi
2023-09-15 20:04:12 -07:00
committed by GitHub
parent 688cd52be2
commit 295702867a
24 changed files with 200 additions and 476 deletions

View File

@@ -10,8 +10,11 @@ from typing import Any, Dict, Optional
import pytest
from agbenchmark.__main__ import CHALLENGES_ALREADY_BEATEN, UPDATES_JSON_PATH
from agbenchmark.agent_api_interface import append_updates_file
from agbenchmark.utils.challenge import Challenge
from agbenchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData
from agent_protocol_client.models.step import Step
DATA_CATEGORY = {}
@@ -48,7 +51,7 @@ def create_single_test(
test_name = self.data.name
try:
with open("challenges_already_beaten.json", "r") as f:
with open(CHALLENGES_ALREADY_BEATEN, "r") as f:
challenges_beaten_in_the_past = json.load(f)
except:
challenges_beaten_in_the_past = {}
@@ -82,7 +85,24 @@ def create_single_test(
)
del scores["answers"] # remove answers from scores
request.node.scores = scores # store scores in request.node
assert 1 in scores["values"]
is_score_100 = 1 in scores["values"]
evaluation = "Correct!" if is_score_100 else "Incorrect."
eval_step = Step(
input=evaluation,
additional_input=None,
task_id="irrelevant, this step is a hack",
step_id="irrelevant, this step is a hack",
name="",
status="created",
output=None,
additional_output=None,
artifacts=[],
is_last=True,
)
await append_updates_file(eval_step)
assert is_score_100
# Parametrize the method here
test_method = pytest.mark.parametrize(
@@ -194,4 +214,18 @@ def challenge_should_be_ignored(json_file):
return "challenges/deprecated" in json_file or "challenges/library" in json_file
def initialize_updates_file():
    """Reset the updates file so that it contains an empty JSON array.

    The file at ``UPDATES_JSON_PATH`` is written with ``[]`` whether or not
    it was already present; only the message printed afterwards differs
    between the "overwritten" and "newly created" cases.
    """
    # Capture this before opening: mode "w" creates the file if it is missing.
    already_present = os.path.exists(UPDATES_JSON_PATH)

    with open(UPDATES_JSON_PATH, "w") as updates_file:
        json.dump([], updates_file, indent=2)

    if already_present:
        print("Initialized updates.json by overwriting with an empty array")
    else:
        print("Created updates.json and initialized it with an empty array")
# Runs at module level: rewrite the file at UPDATES_JSON_PATH as an empty
# JSON array before the next call.
initialize_updates_file()
# NOTE(review): generate_tests is defined outside this excerpt; presumably it
# builds the challenge test functions — confirm against its definition.
generate_tests()