mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2025-12-26 10:24:30 +01:00
Add llm eval (#197)
Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
AGENT_NAME=mini-agi
|
||||
REPORT_LOCATION="../../reports/mini-agi"
|
||||
MOCK_TEST=False
|
||||
MOCK_TEST=False
|
||||
OPENAI_API_KEY="sk-"
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -26,6 +26,7 @@
|
||||
path = agent/beebot
|
||||
url = https://github.com/merwanehamadi/beebot.git
|
||||
branch = master
|
||||
[submodule "agbenchmark/challenges"]
|
||||
[submodule "agbenchmark/challenges"]
|
||||
path = agbenchmark/challenges
|
||||
url = https://github.com/SilenNaihin/agbenchmark_challenges.git
|
||||
branch = main
|
||||
|
||||
Submodule agbenchmark/challenges updated: 59d44a6f4f...8153f05f75
@@ -5,6 +5,9 @@ import sys
|
||||
from abc import ABC
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import openai
|
||||
|
||||
from agbenchmark.agent_interface import MOCK_FLAG
|
||||
from agbenchmark.utils.data_types import ChallengeData, Ground
|
||||
|
||||
|
||||
@@ -117,7 +120,9 @@ class Challenge(ABC):
|
||||
print_content = (
|
||||
f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
|
||||
)
|
||||
if should_contain_word not in content:
|
||||
if self.data.ground.type == "file_llm_evaluation":
|
||||
return self.llm_eval(content, should_contain_word)
|
||||
elif should_contain_word not in content:
|
||||
print(print_content, "False")
|
||||
return 0.0
|
||||
else:
|
||||
@@ -134,6 +139,26 @@ class Challenge(ABC):
|
||||
|
||||
return 1.0
|
||||
|
||||
def llm_eval(self, content: str, should_contain_word: str) -> float:
    """Ask GPT-4 whether ``content`` satisfies a yes/no evaluation question.

    Args:
        content: The text to be judged; it is embedded verbatim in the prompt
            under the ``CONTENT:`` heading.
        should_contain_word: Used as the evaluation question itself; the
            model is instructed to answer it with 0 (no) or 1 (yes).

    Returns:
        ``1.0`` unconditionally when ``MOCK_FLAG`` is truthy; otherwise the
        model's reply converted to a float.

    Raises:
        ValueError: If no numeric score can be extracted from the reply.
    """
    # Function-scope import keeps this block self-contained.
    import re

    openai.api_key = os.getenv("OPENAI_API_KEY")
    if MOCK_FLAG:
        # Mock runs must not call the API; treat the evaluation as passed.
        return 1.0

    evaluation_question = f"""
QUESTION:
{should_contain_word} Answer with 0 for no, 1 for yes.
CONTENT:
{content}
ANSWER:

"""
    answer = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": evaluation_question},
        ],
    )
    reply = answer["choices"][0]["message"]["content"]

    # Fast path: the model answered with a bare number, as instructed.
    try:
        return float(reply)
    except ValueError:
        pass

    # Chat models often wrap the verdict in extra words ("Answer: 1",
    # "1 - yes"). Pick out a standalone 0/1 token rather than aborting the
    # whole scoring run on a formatting deviation.
    verdict = re.search(r"(?<![\d.])[01](?:\.0+)?(?![\d.])", reply)
    if verdict is None:
        raise ValueError(
            f"Could not parse an LLM evaluation score from reply: {reply!r}"
        )
    return float(verdict.group())
|
||||
|
||||
def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
|
||||
scores = []
|
||||
scores_dict = {}
|
||||
|
||||
Reference in New Issue
Block a user