From 01b118e590f070bd88dec3c3aa02084e17f84fa2 Mon Sep 17 00:00:00 2001
From: merwanehamadi
Date: Wed, 26 Jul 2023 14:00:24 -0700
Subject: [PATCH] Add llm eval (#197)

Signed-off-by: Merwane Hamadi
---
 .env.example                   |  3 ++-
 .gitmodules                    |  3 ++-
 agbenchmark/challenges         |  2 +-
 agbenchmark/utils/challenge.py | 27 ++++++++++++++++++++++++++-
 4 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/.env.example b/.env.example
index 66764df6..eaeaaff3 100644
--- a/.env.example
+++ b/.env.example
@@ -1,3 +1,4 @@
 AGENT_NAME=mini-agi
 REPORT_LOCATION="../../reports/mini-agi"
-MOCK_TEST=False
\ No newline at end of file
+MOCK_TEST=False
+OPENAI_API_KEY="sk-"
diff --git a/.gitmodules b/.gitmodules
index 54313e19..0e8d6ea1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -26,6 +26,7 @@
 	path = agent/beebot
 	url = https://github.com/merwanehamadi/beebot.git
 	branch = master
-[submodule "agbenchmark/challenges"]
+[submodule "agbenchmark/challenges"]
 	path = agbenchmark/challenges
 	url = https://github.com/SilenNaihin/agbenchmark_challenges.git
+	branch = main
diff --git a/agbenchmark/challenges b/agbenchmark/challenges
index 59d44a6f..8153f05f 160000
--- a/agbenchmark/challenges
+++ b/agbenchmark/challenges
@@ -1 +1 @@
-Subproject commit 59d44a6f4f3cd778d9034857f7277428269b2028
+Subproject commit 8153f05f758d86ddf01fe7d8cb4fb7c53ddc4d72
diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py
index 02141b4e..5a8e1273 100644
--- a/agbenchmark/utils/challenge.py
+++ b/agbenchmark/utils/challenge.py
@@ -5,6 +5,9 @@ import sys
 from abc import ABC
 from typing import Any, Dict, List
 
+import openai
+
+from agbenchmark.agent_interface import MOCK_FLAG
 from agbenchmark.utils.data_types import ChallengeData, Ground
 
@@ -117,7 +120,9 @@ class Challenge(ABC):
                 print_content = (
                     f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
                 )
-                if should_contain_word not in content:
+                if self.data.ground.type == "file_llm_evaluation":
+                    return self.llm_eval(content, should_contain_word)
+                elif should_contain_word not in content:
                     print(print_content, "False")
                     return 0.0
                 else:
@@ -134,6 +139,26 @@ class Challenge(ABC):
         return 1.0
 
+    def llm_eval(self, content: str, should_contain_word: str) -> float:
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+        if MOCK_FLAG:
+            return 1.0
+        evaluation_question = f"""
+QUESTION:
+{should_contain_word} Answer with 0 for no, 1 for yes.
+CONTENT:
+{content}
+ANSWER:
+
+"""
+        answer = openai.ChatCompletion.create(
+            model="gpt-4",
+            messages=[
+                {"role": "system", "content": evaluation_question},
+            ],
+        )
+        return float(answer["choices"][0]["message"]["content"])
+
     def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
         scores = []
         scores_dict = {}
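
Reviewer note: the new llm_eval path sends the challenge's should_contain entry to GPT-4 as a yes/no question about the agent's output and converts the model's "0"/"1" reply into the usual 0.0/1.0 score, short-circuiting to 1.0 when MOCK_FLAG is set. For trying that scoring pattern outside the benchmark harness, below is a minimal standalone sketch using the same pre-1.0 openai SDK call; the helper name evaluate_with_llm and the sample question are illustrative only and are not part of this patch.

import os

import openai  # pre-1.0 SDK, same openai.ChatCompletion API the patch relies on

openai.api_key = os.getenv("OPENAI_API_KEY")


def evaluate_with_llm(question: str, content: str) -> float:
    """Ask GPT-4 a yes/no question about `content` and map the answer to 0.0/1.0,
    mirroring the prompt format used by the new Challenge.llm_eval method."""
    prompt = f"""
QUESTION:
{question} Answer with 0 for no, 1 for yes.
CONTENT:
{content}
ANSWER:
"""
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "system", "content": prompt}],
    )
    # Like the patch, this assumes the reply is a bare number; float() raises otherwise.
    return float(response["choices"][0]["message"]["content"])


if __name__ == "__main__":
    print(
        evaluate_with_llm(
            "Does the text state the capital of the United States?",
            "Washington, D.C. is the capital of the United States.",
        )
    )

One behaviour worth noting: because the reply is parsed with a bare float(), any answer that is not a plain number (for example "1" followed by an explanation) raises a ValueError and aborts scoring rather than producing a 0.0 or 1.0.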