Add llm eval (#197)

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
This commit is contained in:
merwanehamadi
2023-07-26 14:00:24 -07:00
committed by GitHub
parent b778af156b
commit 01b118e590
4 changed files with 31 additions and 4 deletions

View File

@@ -1,3 +1,4 @@
AGENT_NAME=mini-agi
REPORT_LOCATION="../../reports/mini-agi"
MOCK_TEST=False
OPENAI_API_KEY="sk-"

3
.gitmodules vendored
View File

@@ -26,6 +26,7 @@
path = agent/beebot
url = https://github.com/merwanehamadi/beebot.git
branch = master
[submodule "agbenchmark/challenges"]
path = agbenchmark/challenges
url = https://github.com/SilenNaihin/agbenchmark_challenges.git
branch = main

View File

@@ -5,6 +5,9 @@ import sys
from abc import ABC
from typing import Any, Dict, List
import openai
from agbenchmark.agent_interface import MOCK_FLAG
from agbenchmark.utils.data_types import ChallengeData, Ground
@@ -117,7 +120,9 @@ class Challenge(ABC):
print_content = (
f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
)
if should_contain_word not in content:
if self.data.ground.type == "file_llm_evaluation":
return self.llm_eval(content, should_contain_word)
elif should_contain_word not in content:
print(print_content, "False")
return 0.0
else:
@@ -134,6 +139,26 @@ class Challenge(ABC):
return 1.0
def llm_eval(self, content: str, should_contain_word: str) -> float:
    """Score *content* with an LLM-based yes/no evaluation.

    Builds a prompt from the evaluation question in *should_contain_word*,
    asks GPT-4 to answer with 0 (no) or 1 (yes) about *content*, and
    returns that answer as a float.

    Args:
        content: The text produced by the agent that is being judged.
        should_contain_word: The evaluation question to put to the model
            (despite the name, it is a full question here, not a literal word).

    Returns:
        1.0 when the model answers yes, 0.0 when it answers no.
        Always 1.0 when MOCK_FLAG is set (no API call is made).

    Raises:
        ValueError: If the model's reply cannot be parsed as a float.
    """
    # Short-circuit before any API setup: mock runs must not require
    # an OPENAI_API_KEY to be present in the environment.
    if MOCK_FLAG:
        return 1.0
    openai.api_key = os.getenv("OPENAI_API_KEY")
    evaluation_question = f"""
    QUESTION:
    {should_contain_word} Answer with 0 for no, 1 for yes.
    CONTENT:
    {content}
    ANSWER:
    """
    answer = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": evaluation_question},
        ],
    )
    # strip() tolerates surrounding whitespace/newlines in the model reply
    # before the numeric conversion.
    return float(answer["choices"][0]["message"]["content"].strip())
def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
scores = []
scores_dict = {}