Add llm eval (#197)

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
This commit is contained in:
merwanehamadi
2023-07-26 14:00:24 -07:00
committed by GitHub
parent b778af156b
commit 01b118e590
4 changed files with 31 additions and 4 deletions

View File

@@ -1,3 +1,4 @@
AGENT_NAME=mini-agi
REPORT_LOCATION="../../reports/mini-agi"
MOCK_TEST=False
OPENAI_API_KEY="sk-"

3
.gitmodules vendored
View File

@@ -26,6 +26,7 @@
path = agent/beebot
url = https://github.com/merwanehamadi/beebot.git
branch = master
[submodule "agbenchmark/challenges"]
path = agbenchmark/challenges
url = https://github.com/SilenNaihin/agbenchmark_challenges.git
branch = main

View File

@@ -5,6 +5,9 @@ import sys
from abc import ABC
from typing import Any, Dict, List
import openai
from agbenchmark.agent_interface import MOCK_FLAG
from agbenchmark.utils.data_types import ChallengeData, Ground
@@ -117,7 +120,9 @@ class Challenge(ABC):
print_content = (
f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
)
if should_contain_word not in content:
if self.data.ground.type == "file_llm_evaluation":
return self.llm_eval(content, should_contain_word)
elif should_contain_word not in content:
print(print_content, "False")
return 0.0
else:
@@ -134,6 +139,26 @@ class Challenge(ABC):
return 1.0
def llm_eval(self, content: str, should_contain_word: str) -> float:
    """Score *content* with an LLM-based yes/no evaluation.

    Builds a prompt from the evaluation question in *should_contain_word*,
    asks GPT-4 to answer with 0 (no) or 1 (yes) about *content*, and
    returns that answer as a float.

    Args:
        content: The text produced by the agent that is being judged.
        should_contain_word: The evaluation question to put to the model
            (despite the name, it is a full question here, not a literal word).

    Returns:
        1.0 when the model answers yes, 0.0 when it answers no.
        Always 1.0 when MOCK_FLAG is set (no API call is made).

    Raises:
        ValueError: If the model's reply cannot be parsed as a float.
    """
    # Short-circuit before any API setup: mock runs must not require
    # an OPENAI_API_KEY to be present in the environment.
    if MOCK_FLAG:
        return 1.0
    openai.api_key = os.getenv("OPENAI_API_KEY")
    evaluation_question = f"""
    QUESTION:
    {should_contain_word} Answer with 0 for no, 1 for yes.
    CONTENT:
    {content}
    ANSWER:
    """
    answer = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": evaluation_question},
        ],
    )
    # strip() tolerates surrounding whitespace/newlines in the model reply
    # before the numeric conversion.
    return float(answer["choices"][0]["message"]["content"].strip())
def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
scores = []
scores_dict = {}