diff --git a/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json index d4967c1b..dfbcca14 100644 --- a/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json +++ b/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json @@ -9,6 +9,7 @@ "eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac", "ground": { "answer": "The csv labelled", + "case_sensitive": true, "eval": { "type": "file" }, diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json index 703636bc..f198d2e4 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json @@ -10,6 +10,7 @@ "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "The twitter handles of the two hosts of Latent Space.", + "case_sensitive": false, "eval": { "type": "file" }, diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json index c3c069f2..3a696235 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json @@ -10,6 +10,7 @@ "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "Toran is from Scotland https://uk.linkedin.com/in/toran-richards.", + "case_sensitive": true, "eval": { "type": "file" }, @@ -17,7 +18,7 @@ "output.txt" ], "should_contain": [ - "cotland" + "Scotland" ], "should_not_contain": [] }, diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json index bb733702..4e132762 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json @@ -11,6 +11,7 @@ "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "https://www.amazon.com/gp/bestsellers/2021/books, second book, the answer is Reed", + "case_sensitive": true, "eval": { "type": "file" }, diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json index 48db48ce..a4fe85c6 100644 --- a/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json +++ b/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json @@ -11,6 +11,7 @@ "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5", "ground": { "answer": "Get to https://www.forbes.com/special-report/2012/30-under-30/30-under-30_games.html, then https://www.linkedin.com/in/brianjcho/details/experience/ is the first 27 year old, then find his longest working job on Linkedin which is Riot Games.", + "case_sensitive": true, "eval": { "type": "file" }, diff --git a/benchmark/agbenchmark/utils/challenge.py b/benchmark/agbenchmark/utils/challenge.py index 36bf0af2..20353f68 100644 --- a/benchmark/agbenchmark/utils/challenge.py +++ b/benchmark/agbenchmark/utils/challenge.py @@ -123,6 +123,9 @@ class Challenge(ABC): print("\033[1;34mScoring content:\033[0m", content) if ground.should_contain: for should_contain_word in ground.should_contain: + if not getattr(ground, 'case_sensitive', True): + should_contain_word = should_contain_word.lower() + content = content.lower() print_content = ( f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:" ) @@ -134,6 +137,9 @@ class Challenge(ABC): if ground.should_not_contain: for should_not_contain_word in ground.should_not_contain: + if not getattr(ground, 'case_sensitive', True): + should_not_contain_word = should_not_contain_word.lower() + content = content.lower() print_content = f"\033[1;34mWord that should not exist\033[0m - {should_not_contain_word}:" if should_not_contain_word in content: print(print_content, "False") diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/agbenchmark/utils/data_types.py index 8f1e53d0..74b50932 100644 --- a/benchmark/agbenchmark/utils/data_types.py +++ b/benchmark/agbenchmark/utils/data_types.py @@ -165,6 +165,7 @@ class Ground(BaseModel): should_contain: Optional[List[str]] = None should_not_contain: Optional[List[str]] = None files: List[str] + case_sensitive: Optional[bool] = True eval: Eval