From f37981c3884264c50f2af93799f3667b5dc42cca Mon Sep 17 00:00:00 2001
From: scarletpan
Date: Mon, 19 Jun 2023 12:39:34 +0000
Subject: [PATCH] Init first challenge template

---
 agbenchmark/benchmark/challenges/Challenge.py | 12 ++++++
 .../benchmark/challenges/retrieval/r1_test.py | 29 ++++++++++++++
 data/README.md                                | 33 ++++++++++++++++
 data/retrieval/r1_test_data_0.json            | 10 +++++
 data/retrieval/r1_test_data_1.json            | 10 +++++
 examples/basic_gpt_agent.py                   | 26 +++++++++++++
 6 files changed, 120 insertions(+)
 create mode 100644 data/README.md
 create mode 100644 data/retrieval/r1_test_data_0.json
 create mode 100644 data/retrieval/r1_test_data_1.json
 create mode 100644 examples/basic_gpt_agent.py

diff --git a/agbenchmark/benchmark/challenges/Challenge.py b/agbenchmark/benchmark/challenges/Challenge.py
index e69de29b..bed522a8 100644
--- a/agbenchmark/benchmark/challenges/Challenge.py
+++ b/agbenchmark/benchmark/challenges/Challenge.py
@@ -0,0 +1,12 @@
+import json
+
+class Challenge:
+    """Base class for benchmark challenges loaded from a JSON spec."""
+    def __init__(self, json_data):
+        self.json_data = json_data
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        with open(json_file) as f:
+            json_data = json.load(f)
+        return cls(json_data)
diff --git a/agbenchmark/benchmark/challenges/retrieval/r1_test.py b/agbenchmark/benchmark/challenges/retrieval/r1_test.py
index e69de29b..f300d094 100644
--- a/agbenchmark/benchmark/challenges/retrieval/r1_test.py
+++ b/agbenchmark/benchmark/challenges/retrieval/r1_test.py
@@ -0,0 +1,29 @@
+from ..Challenge import Challenge
+
+
+class RetrievalChallenge(Challenge):
+    """Challenge for information retrieval."""
+
+    def __init__(self, json_data):
+        super().__init__(json_data)
+        assert self.json_data["category"] == "information-retrieval"
+
+    @property
+    def agent_input(self):
+        return self.json_data["query"]
+
+    def scoring(self, content):
+        # Every required string must appear in the answer...
+        for should_contain_word in self.json_data["ground"]["should_contain"]:
+            if should_contain_word not in content:
+                return 0.0
+        # ...and no forbidden string may appear.
+        for should_not_contain_word in self.json_data["ground"]["should_not_contain"]:
+            if should_not_contain_word in content:
+                return 0.0
+        return 1.0
+
+    def run(self, output_file):
+        with open(output_file) as f:
+            output = f.read().strip()
+        return self.scoring(output)
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 00000000..d3e32b56
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,33 @@
+# Benchmark Challenge Data Schema
+
+## General challenges
+Input:
+- **category** (str): the challenge category, e.g. "information-retrieval"
+- **difficulty_level** (str): the difficulty of this query; one of ["easy", "medium", "hard"]
+
+## Information-retrieval challenges
+Input:
+- **category** (str): information-retrieval
+- **query** (str): the question to be solved
+- **ground** (dict): the ground truth
+    - **answer** (str): the raw text of the ground-truth answer
+    - **should_contain** (list): exact strings that must appear in the final answer
+    - **should_not_contain** (list): exact strings that must not appear in the final answer
+- **difficulty_level** (str): the difficulty of this query; one of ["easy", "medium", "hard"]
+
+Example:
+```json
+{
+    "category": "information-retrieval",
+    "query": "what is the capital of America",
+    "ground": {
+        "answer": "Washington",
+        "should_contain": ["Washington"],
+        "should_not_contain": ["New York", "Los Angeles", "San Francisco"]
+    },
+    "difficulty_level": "easy"
+}
+```
+
+Output:
+- **score** (float): a score in the range [0, 1]
diff --git a/data/retrieval/r1_test_data_0.json b/data/retrieval/r1_test_data_0.json
new file mode 100644
index 00000000..a64f7e0c
--- /dev/null
+++ b/data/retrieval/r1_test_data_0.json
@@ -0,0 +1,10 @@
+{
+    "category": "information-retrieval",
+    "query": "what is the capital of America",
+    "ground": {
+        "answer": "Washington",
+        "should_contain": ["Washington"],
+        "should_not_contain": ["New York", "Los Angeles", "San Francisco"]
+    },
+    "difficulty_level": "easy"
+}
diff --git a/data/retrieval/r1_test_data_1.json b/data/retrieval/r1_test_data_1.json
new file mode 100644
index 00000000..73dec4cd
--- /dev/null
+++ b/data/retrieval/r1_test_data_1.json
@@ -0,0 +1,10 @@
+{
+    "category": "information-retrieval",
+    "query": "The Nobel Prize in Literature 2012",
+    "ground": {
+        "answer": "Mo Yan",
+        "should_contain": ["Mo Yan"],
+        "should_not_contain": ["Murakami Haruki"]
+    },
+    "difficulty_level": "easy"
+}
diff --git a/examples/basic_gpt_agent.py b/examples/basic_gpt_agent.py
new file mode 100644
index 00000000..e2cc380c
--- /dev/null
+++ b/examples/basic_gpt_agent.py
@@ -0,0 +1,26 @@
+import openai
+from agbenchmark.benchmark.challenges.retrieval.r1_test import RetrievalChallenge
+
+
+def basic_gpt_agent(challenge_file):
+    challenge = RetrievalChallenge.from_json_file(challenge_file)
+
+    # Ask the model to answer the challenge query in a single turn.
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo-0613",
+        messages=[{"role": "user", "content": challenge.agent_input}])
+    answer = response["choices"][0]["message"]["content"]
+
+    # Persist the answer so the challenge can score the file's contents.
+    output_file = "./basic_gpt_agent_retrieval_results.txt"
+    with open(output_file, "w") as f:
+        f.write(answer)
+
+    print("QUERY       : ", challenge.agent_input)
+    print("AGENT ANSWER: ", answer)
+
+    score = challenge.run(output_file)
+    print("AGENT SCORE : ", score)
+
+if __name__ == "__main__":
+    basic_gpt_agent("./data/retrieval/r1_test_data_1.json")
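
The containment rubric above can be sanity-checked offline. A minimal sketch: the inline dict simply mirrors `data/retrieval/r1_test_data_0.json`, so nothing beyond the classes added in this patch is assumed and no OpenAI key or network access is needed.

```python
# Offline sanity check for the scoring rubric introduced in this patch.
# The challenge dict mirrors data/retrieval/r1_test_data_0.json, so no
# API key or network access is required.
from agbenchmark.benchmark.challenges.retrieval.r1_test import RetrievalChallenge

challenge = RetrievalChallenge({
    "category": "information-retrieval",
    "query": "what is the capital of America",
    "ground": {
        "answer": "Washington",
        "should_contain": ["Washington"],
        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
    },
    "difficulty_level": "easy",
})

# 1.0 requires every should_contain string and no should_not_contain string.
assert challenge.scoring("The capital of America is Washington, D.C.") == 1.0
# A forbidden city zeroes the score even when the right answer is present.
assert challenge.scoring("Washington, not New York") == 0.0
# Missing the required string also scores 0.0.
assert challenge.scoring("I could not find the answer") == 0.0
```

Note that `scoring` does plain substring matching, so "Washington" also matches inside "Washington, D.C."; `should_contain` entries should be chosen with that in mind.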
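
Because `from_json_file` lives on the base `Challenge`, supporting a new category should only require a subclass that defines `agent_input` and `scoring`. Below is a sketch of that extension point; `QAChallenge`, the "question-answering" category string, and the exact-match rubric are illustrative assumptions, not part of this patch.

```python
# Hypothetical subclass sketching the intended extension point.
# QAChallenge, "question-answering", and the exact-match rubric are
# assumptions for illustration -- none of them exist in this patch.
from agbenchmark.benchmark.challenges.Challenge import Challenge


class QAChallenge(Challenge):
    def __init__(self, json_data):
        super().__init__(json_data)
        assert self.json_data["category"] == "question-answering"

    @property
    def agent_input(self):
        return self.json_data["query"]

    def scoring(self, content):
        # Stricter than RetrievalChallenge: the trimmed answer must match
        # the ground-truth answer exactly, ignoring case.
        expected = self.json_data["ground"]["answer"]
        return 1.0 if content.strip().lower() == expected.lower() else 0.0
```

Loading then reuses the inherited constructor, e.g. `QAChallenge.from_json_file("path/to/challenge.json")`.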