From f37981c3884264c50f2af93799f3667b5dc42cca Mon Sep 17 00:00:00 2001
From: scarletpan
Date: Mon, 19 Jun 2023 12:39:34 +0000
Subject: [PATCH] Init first challenge template

---
 agbenchmark/benchmark/challenges/Challenge.py | 12 ++++++
 .../benchmark/challenges/retrieval/r1_test.py | 29 ++++++++++++++
 data/README.md                                | 33 ++++++++++++++++
 data/retrieval/r1_test_data_0.json            | 10 +++++
 data/retrieval/r1_test_data_1.json            | 10 +++++
 examples/basic_gpt_agent.py                   | 26 +++++++++++++
 6 files changed, 120 insertions(+)
 create mode 100644 data/README.md
 create mode 100644 data/retrieval/r1_test_data_0.json
 create mode 100644 data/retrieval/r1_test_data_1.json
 create mode 100644 examples/basic_gpt_agent.py

diff --git a/agbenchmark/benchmark/challenges/Challenge.py b/agbenchmark/benchmark/challenges/Challenge.py
index e69de29b..bed522a8 100644
--- a/agbenchmark/benchmark/challenges/Challenge.py
+++ b/agbenchmark/benchmark/challenges/Challenge.py
@@ -0,0 +1,12 @@
+import json
+
+class Challenge:
+    """Base class for benchmark challenges loaded from a JSON spec."""
+    def __init__(self, json_data):
+        self.json_data = json_data
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        with open(json_file) as f:
+            json_data = json.load(f)
+        return cls(json_data)
diff --git a/agbenchmark/benchmark/challenges/retrieval/r1_test.py b/agbenchmark/benchmark/challenges/retrieval/r1_test.py
index e69de29b..f300d094 100644
--- a/agbenchmark/benchmark/challenges/retrieval/r1_test.py
+++ b/agbenchmark/benchmark/challenges/retrieval/r1_test.py
@@ -0,0 +1,29 @@
+from ..Challenge import Challenge
+
+
+class RetrievalChallenge(Challenge):
+    """Challenge for information retrieval."""
+
+    def __init__(self, json_data):
+        super().__init__(json_data)
+        assert self.json_data["category"] == "information-retrieval"
+
+    @property
+    def agent_input(self):
+        return self.json_data["query"]
+
+    def scoring(self, content):
+        # Every required string must appear in the answer...
+        for should_contain_word in self.json_data["ground"]["should_contain"]:
+            if should_contain_word not in content:
+                return 0.0
+        # ...and no forbidden string may appear.
+        for should_not_contain_word in self.json_data["ground"]["should_not_contain"]:
+            if should_not_contain_word in content:
+                return 0.0
+        return 1.0
+
+    def run(self, output_file):
+        with open(output_file) as f:
+            output = f.read().strip()
+        return self.scoring(output)
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 00000000..d3e32b56
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,33 @@
+# Benchmark Challenge Data Schema
+
+## General challenges
+Input:
+- **category** (str): the challenge category, e.g. "information-retrieval"
+- **difficulty_level** (str): the difficulty of this query; one of ["easy", "medium", "hard"]
+
+## Information-retrieval challenges
+Input:
+- **category** (str): information-retrieval
+- **query** (str): the question to be solved
+- **ground** (dict): the ground truth
+    - **answer** (str): the raw text of the ground-truth answer
+    - **should_contain** (list): exact strings that must appear in the final answer
+    - **should_not_contain** (list): exact strings that must not appear in the final answer
+- **difficulty_level** (str): the difficulty of this query; one of ["easy", "medium", "hard"]
+
+Example:
+```json
+{
+    "category": "information-retrieval",
+    "query": "what is the capital of America",
+    "ground": {
+        "answer": "Washington",
+        "should_contain": ["Washington"],
+        "should_not_contain": ["New York", "Los Angeles", "San Francisco"]
+    },
+    "difficulty_level": "easy"
+}
+```
+
+Output:
+- **score** (float): a score in the range [0, 1]
diff --git a/data/retrieval/r1_test_data_0.json b/data/retrieval/r1_test_data_0.json
new file mode 100644
index 00000000..a64f7e0c
--- /dev/null
+++ b/data/retrieval/r1_test_data_0.json
@@ -0,0 +1,10 @@
+{
+    "category": "information-retrieval",
+    "query": "what is the capital of America",
+    "ground": {
+        "answer": "Washington",
+        "should_contain": ["Washington"],
+        "should_not_contain": ["New York", "Los Angeles", "San Francisco"]
+    },
+    "difficulty_level": "easy"
+}
diff --git a/data/retrieval/r1_test_data_1.json b/data/retrieval/r1_test_data_1.json
new file mode 100644
index 00000000..73dec4cd
--- /dev/null
+++ b/data/retrieval/r1_test_data_1.json
@@ -0,0 +1,10 @@
+{
+    "category": "information-retrieval",
+    "query": "The Nobel Prize in Literature 2012",
+    "ground": {
+        "answer": "Mo Yan",
+        "should_contain": ["Mo Yan"],
+        "should_not_contain": ["Murakami Haruki"]
+    },
+    "difficulty_level": "easy"
+}
diff --git a/examples/basic_gpt_agent.py b/examples/basic_gpt_agent.py
new file mode 100644
index 00000000..e2cc380c
--- /dev/null
+++ b/examples/basic_gpt_agent.py
@@ -0,0 +1,26 @@
+import openai
+from agbenchmark.benchmark.challenges.retrieval.r1_test import RetrievalChallenge
+
+
+def basic_gpt_agent(challenge_file):
+    challenge = RetrievalChallenge.from_json_file(challenge_file)
+
+    # Ask the model to answer the challenge query in a single turn.
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo-0613",
+        messages=[{"role": "user", "content": challenge.agent_input}])
+    answer = response["choices"][0]["message"]["content"]
+
+    # Persist the answer so the challenge can score the file's contents.
+    output_file = "./basic_gpt_agent_retrieval_results.txt"
+    with open(output_file, "w") as f:
+        f.write(answer)
+
+    print("QUERY       : ", challenge.agent_input)
+    print("AGENT ANSWER: ", answer)
+
+    score = challenge.run(output_file)
+    print("AGENT SCORE : ", score)
+
+if __name__ == "__main__":
+    basic_gpt_agent("./data/retrieval/r1_test_data_1.json")
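
The containment rubric above can be sanity-checked offline. A minimal sketch: the inline dict simply mirrors `data/retrieval/r1_test_data_0.json`, so nothing beyond the classes added in this patch is assumed and no OpenAI key or network access is needed.

```python
# Offline sanity check for the scoring rubric introduced in this patch.
# The challenge dict mirrors data/retrieval/r1_test_data_0.json, so no
# API key or network access is required.
from agbenchmark.benchmark.challenges.retrieval.r1_test import RetrievalChallenge

challenge = RetrievalChallenge({
    "category": "information-retrieval",
    "query": "what is the capital of America",
    "ground": {
        "answer": "Washington",
        "should_contain": ["Washington"],
        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
    },
    "difficulty_level": "easy",
})

# 1.0 requires every should_contain string and no should_not_contain string.
assert challenge.scoring("The capital of America is Washington, D.C.") == 1.0
# A forbidden city zeroes the score even when the right answer is present.
assert challenge.scoring("Washington, not New York") == 0.0
# Missing the required string also scores 0.0.
assert challenge.scoring("I could not find the answer") == 0.0
```

Note that `scoring` does plain substring matching, so "Washington" also matches inside "Washington, D.C."; `should_contain` entries should be chosen with that in mind.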
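
Because `from_json_file` lives on the base `Challenge`, supporting a new category should only require a subclass that defines `agent_input` and `scoring`. Below is a sketch of that extension point; `QAChallenge`, the "question-answering" category string, and the exact-match rubric are illustrative assumptions, not part of this patch.

```python
# Hypothetical subclass sketching the intended extension point.
# QAChallenge, "question-answering", and the exact-match rubric are
# assumptions for illustration -- none of them exist in this patch.
from agbenchmark.benchmark.challenges.Challenge import Challenge


class QAChallenge(Challenge):
    def __init__(self, json_data):
        super().__init__(json_data)
        assert self.json_data["category"] == "question-answering"

    @property
    def agent_input(self):
        return self.json_data["query"]

    def scoring(self, content):
        # Stricter than RetrievalChallenge: the trimmed answer must match
        # the ground-truth answer exactly, ignoring case.
        expected = self.json_data["ground"]["answer"]
        return 1.0 if content.strip().lower() == expected.lower() else 0.0
```

Loading then reuses the inherited constructor, e.g. `QAChallenge.from_json_file("path/to/challenge.json")`.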