From 01b118e590f070bd88dec3c3aa02084e17f84fa2 Mon Sep 17 00:00:00 2001
From: merwanehamadi
Date: Wed, 26 Jul 2023 14:00:24 -0700
Subject: [PATCH] Add llm eval (#197)

Signed-off-by: Merwane Hamadi
---
 .env.example                   |  3 ++-
 .gitmodules                    |  3 ++-
 agbenchmark/challenges         |  2 +-
 agbenchmark/utils/challenge.py | 27 ++++++++++++++++++++++++++-
 4 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/.env.example b/.env.example
index 66764df6..eaeaaff3 100644
--- a/.env.example
+++ b/.env.example
@@ -1,3 +1,4 @@
 AGENT_NAME=mini-agi
 REPORT_LOCATION="../../reports/mini-agi"
-MOCK_TEST=False
\ No newline at end of file
+MOCK_TEST=False
+OPENAI_API_KEY="sk-"
diff --git a/.gitmodules b/.gitmodules
index 54313e19..0e8d6ea1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -26,6 +26,7 @@
 	path = agent/beebot
 	url = https://github.com/merwanehamadi/beebot.git
 	branch = master
-[submodule "agbenchmark/challenges"]
+[submodule "agbenchmark/challenges"]
 	path = agbenchmark/challenges
 	url = https://github.com/SilenNaihin/agbenchmark_challenges.git
+	branch = main
diff --git a/agbenchmark/challenges b/agbenchmark/challenges
index 59d44a6f..8153f05f 160000
--- a/agbenchmark/challenges
+++ b/agbenchmark/challenges
@@ -1 +1 @@
-Subproject commit 59d44a6f4f3cd778d9034857f7277428269b2028
+Subproject commit 8153f05f758d86ddf01fe7d8cb4fb7c53ddc4d72
diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py
index 02141b4e..5a8e1273 100644
--- a/agbenchmark/utils/challenge.py
+++ b/agbenchmark/utils/challenge.py
@@ -5,6 +5,9 @@ import sys
 from abc import ABC
 from typing import Any, Dict, List
 
+import openai
+
+from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.utils.data_types import ChallengeData, Ground
 
@@ -117,7 +120,9 @@ class Challenge(ABC):
                 print_content = (
                     f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
                 )
-                if should_contain_word not in content:
+                if self.data.ground.type == "file_llm_evaluation":
+                    return self.llm_eval(content, should_contain_word)
+                elif should_contain_word not in content:
                     print(print_content, "False")
                     return 0.0
                 else:
@@ -134,6 +139,26 @@ class Challenge(ABC):
         return 1.0
 
+    def llm_eval(self, content: str, should_contain_word: str) -> float:
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+        if MOCK_FLAG:
+            return 1.0
+        evaluation_question = f"""
+QUESTION:
+{should_contain_word} Answer with 0 for no, 1 for yes.
+CONTENT:
+{content}
+ANSWER:
+
+"""
+        answer = openai.ChatCompletion.create(
+            model="gpt-4",
+            messages=[
+                {"role": "system", "content": evaluation_question},
+            ],
+        )
+        return float(answer["choices"][0]["message"]["content"])
+
     def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
         scores = []
         scores_dict = {}
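
Reviewer note: the new llm_eval path sends the challenge's should_contain entry to GPT-4 as a yes/no question about the agent's output and converts the model's "0"/"1" reply into the usual 0.0/1.0 score, short-circuiting to 1.0 when MOCK_FLAG is set. For trying that scoring pattern outside the benchmark harness, below is a minimal standalone sketch using the same pre-1.0 openai SDK call; the helper name evaluate_with_llm and the sample question are illustrative only and are not part of this patch.

import os

import openai  # pre-1.0 SDK, same openai.ChatCompletion API the patch relies on

openai.api_key = os.getenv("OPENAI_API_KEY")


def evaluate_with_llm(question: str, content: str) -> float:
    """Ask GPT-4 a yes/no question about `content` and map the answer to 0.0/1.0,
    mirroring the prompt format used by the new Challenge.llm_eval method."""
    prompt = f"""
QUESTION:
{question} Answer with 0 for no, 1 for yes.
CONTENT:
{content}
ANSWER:
"""
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "system", "content": prompt}],
    )
    # Like the patch, this assumes the reply is a bare number; float() raises otherwise.
    return float(response["choices"][0]["message"]["content"])


if __name__ == "__main__":
    print(
        evaluate_with_llm(
            "Does the text state the capital of the United States?",
            "Washington, D.C. is the capital of the United States.",
        )
    )

One behaviour worth noting: because the reply is parsed with a bare float(), any answer that is not a plain number (for example "1" followed by an explanation) raises a ValueError and aborts scoring rather than producing a 0.0 or 1.0.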