diff --git a/agbenchmark/challenges b/agbenchmark/challenges index 4f3b149d..b1945bb0 160000 --- a/agbenchmark/challenges +++ b/agbenchmark/challenges @@ -1 +1 @@ -Subproject commit 4f3b149dcaee2c106fa1c47c7c6a912b6ac2aace +Subproject commit b1945bb0a95b6184bbbc0af1b260c1cde838eaac diff --git a/agbenchmark/generate_test.py b/agbenchmark/generate_test.py index c442d52a..69dfa45b 100644 --- a/agbenchmark/generate_test.py +++ b/agbenchmark/generate_test.py @@ -134,8 +134,8 @@ def create_single_test( scores = self.get_scores(config) request.node.scores = scores # store scores in request.node - - assert 1 in scores["values"] + for score in scores["values"]: + assert score >= 1 # Parametrize the method here test_method = pytest.mark.parametrize( diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py index eb9c7019..9a08cb0a 100644 --- a/agbenchmark/utils/challenge.py +++ b/agbenchmark/utils/challenge.py @@ -215,6 +215,8 @@ class Challenge(ABC): scores.append(math.ceil(llm_eval / 100)) elif self.data.ground.eval.scoring == "scale": scores.append(math.ceil(llm_eval / 10)) + print("\033[1;32mYour score is:\033[0m", llm_eval) + scores.append(llm_eval) elif isinstance(self.data.ground, dict): # if it's a dict then we know its a combined suite