Benchmark changes
Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
benchmark/agbenchmark/utils/challenge.py (Normal file, 272 lines added)
@@ -0,0 +1,272 @@
import glob
import math
import os
import subprocess
import sys
from abc import ABC
from pathlib import Path
from typing import Any, Dict, List

import openai
import pytest

from agbenchmark.__main__ import OPTIONAL_CATEGORIES
from agbenchmark.agent_api_interface import run_api_agent
from agbenchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData, Ground
from agbenchmark.utils.prompts import (
    END_PROMPT,
    FEW_SHOT_EXAMPLES,
    PROMPT_MAP,
    SCORING_MAP,
)
from agbenchmark.utils.utils import agent_eligibible_for_optional_categories


class Challenge(ABC):
    """The parent class to all specific challenge classes.
    Defines helper methods for running a challenge."""

    _data_cache: Dict[str, ChallengeData] = {}
    CHALLENGE_LOCATION: str = ""
    scores: dict[str, Any] = {}  # this is for suites

    @property
    def data(self) -> ChallengeData:
        if self.CHALLENGE_LOCATION not in self._data_cache:
            self._data_cache[self.CHALLENGE_LOCATION] = ChallengeData.deserialize(
                self.CHALLENGE_LOCATION
            )
        return self._data_cache[self.CHALLENGE_LOCATION]

    @property
    def task(self) -> str:
        return self.data.task

    @property
    def dependencies(self) -> list:
        return self.data.dependencies

    async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
        from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent

        artifact_paths = [
            self.ARTIFACTS_LOCATION,
            str(Path(self.CHALLENGE_LOCATION).parent),
        ]

        for path in artifact_paths:
            copy_artifacts_into_workspace(config["workspace"], "artifacts_in", path)

        if not self.task:
            return

        print(
            f"\033[1;35m============Starting {self.data.name} challenge============\033[0m"
        )
        print(f"\033[1;30mTask: {self.task}\033[0m")

        if "--api_mode" in sys.argv:
            await run_api_agent(self.data, config, self.ARTIFACTS_LOCATION, cutoff)
        elif "--mock" in sys.argv:
            print("Running mock agent")
            for path in artifact_paths:
                copy_artifacts_into_workspace(
                    config["workspace"], "artifacts_out", path
                )
        else:
            agent_benchmark_config: AgentBenchmarkConfig = config[
                "AgentBenchmarkConfig"
            ]
            run_agent(self.task, cutoff, agent_config=agent_benchmark_config)

        # hidden files are added after the agent runs. Hidden files can be python test files.
        # We copy them in the workspace to make it easy to import the code produced by the agent

        for path in artifact_paths:
            copy_artifacts_into_workspace(config["workspace"], "custom_python", path)

    def test_method(self, config: Dict[str, Any]) -> None:
        raise NotImplementedError

    @staticmethod
    def open_file(workspace: str, filename: str) -> str:
        script_dir = workspace
        workspace_dir = os.path.join(script_dir, filename)
        with open(workspace_dir, "r") as f:
            return f.read()

    def get_artifacts_out(
        self, workspace: str | dict[str, str], ground: Ground
    ) -> List[str]:
        if isinstance(workspace, dict):
            workspace = workspace["output"]

        script_dir = workspace
        files_contents = []

        for file_pattern in ground.files:
            # Check if it is a file extension
            if file_pattern.startswith("."):
                # Find all files with the given extension in the workspace
                matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern))
            else:
                # Otherwise, it is a specific file
                matching_files = [os.path.join(script_dir, file_pattern)]

            for file_path in matching_files:
                if ground.eval.type == "python":
                    result = subprocess.run(
                        [sys.executable, file_path],
                        cwd=os.path.abspath(workspace),
                        capture_output=True,
                        text=True,
                    )
                    if "error" in result.stderr or result.returncode != 0:
                        print(result.stderr)
                        assert False, result.stderr
                    files_contents.append(f"Output: {result.stdout}\n")
                else:
                    with open(file_path, "r") as f:
                        files_contents.append(f.read())
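        # Note: the "else" below belongs to the outer "for file_pattern" loop (for/else),
        # so the pytest branch runs once, after all file patterns have been processed.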
        else:
            if ground.eval.type == "pytest":
                result = subprocess.run(
                    [sys.executable, "-m", "pytest"],
                    cwd=os.path.abspath(workspace),
                    capture_output=True,
                    text=True,
                )
                if "error" in result.stderr or result.returncode != 0:
                    print(result.stderr)
                    assert False, result.stderr
                files_contents.append(f"Output: {result.stdout}\n")

        return files_contents

    @staticmethod
    def write_to_file(workspace: str, filename: str, content: str) -> None:
        script_dir = workspace
        print("Writing file at", script_dir)
        workspace_dir = os.path.join(script_dir, filename)

        # Open the file in write mode.
        with open(workspace_dir, "w") as f:
            # Write the content to the file.
            f.write(content)

    def get_filenames_in_workspace(self, workspace: str) -> List[str]:
        return [
            filename
            for filename in os.listdir(workspace)
            if os.path.isfile(os.path.join(workspace, filename))
        ]

    def scoring(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
        print("\033[1;34mScoring content:\033[0m", content)
        if ground.should_contain:
            for should_contain_word in ground.should_contain:
                print_content = (
                    f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
                )
                if should_contain_word not in content:
                    print(print_content, "False")
                    return 0.0
                else:
                    print(print_content, "True")

        if ground.should_not_contain:
            for should_not_contain_word in ground.should_not_contain:
                print_content = f"\033[1;34mWord that should not exist\033[0m - {should_not_contain_word}:"
                if should_not_contain_word in content:
                    print(print_content, "False")
                    return 0.0
                else:
                    print(print_content, "True")

        return 1.0

    def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
        openai.api_key = os.getenv("OPENAI_API_KEY")
        if "--mock" in sys.argv:
            return 1.0

        # the validation for this is done in the Eval BaseModel
        scoring = SCORING_MAP[ground.eval.scoring]  # type: ignore
        prompt = PROMPT_MAP[ground.eval.template].format(task=self.data.task, scoring=scoring, answer=ground.answer, response=content)  # type: ignore

        if ground.eval.examples:
            prompt += FEW_SHOT_EXAMPLES.format(examples=ground.eval.examples)

        prompt += END_PROMPT

        answer = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": prompt},
            ],
        )

        return float(answer["choices"][0]["message"]["content"])  # type: ignore

    def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
        scores = []
        scores_dict: Any = {}
        percentage = None
        answers = {}
        try:
            if self.data.task == "" and "--mock" in sys.argv:
                scores = [1.0]
                answers = {"mock": "This is a mock answer"}
            elif isinstance(self.data.ground, Ground):
                files_contents = self.get_artifacts_out(
                    config["workspace"], self.data.ground
                )
                answers = {"answer": files_contents}
                for file_content in files_contents:
                    score = self.scoring(config, file_content, self.data.ground)
                    print("\033[1;32mYour score is:\033[0m", score)
                    scores.append(score)

                if self.data.ground.eval.type == "llm":
                    llm_eval = self.llm_eval(
                        config, "\n".join(files_contents), self.data.ground
                    )
                    if self.data.ground.eval.scoring == "percentage":
                        scores.append(math.ceil(llm_eval / 100))
                    elif self.data.ground.eval.scoring == "scale":
                        scores.append(math.ceil(llm_eval / 10))
                    print("\033[1;32mYour score is:\033[0m", llm_eval)

                    scores.append(llm_eval)
        except Exception as e:
            print("Error getting scores", e)

        scores_data = {
            "values": scores,
            "scores_obj": scores_dict,
            "percentage": percentage,
            "answers": answers,
        }

        self.scores[self.__class__.__name__] = scores_data

        return scores_data

    def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
        return 1  # remove this once this works
        if 1 in scores.get("scores_obj", {}).get(test_name, []):
            return 1

        return None

    def skip_optional_categories(self, config: Dict[str, Any]) -> None:
        challenge_category = self.data.category
        categories = [
            category
            for category in OPTIONAL_CATEGORIES
            if category in challenge_category
        ]
        if not agent_eligibible_for_optional_categories(
            categories, config.get("category", [])
        ):
            pytest.skip("Agent is not eligible for this category")
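For orientation, here is a minimal sketch of how a concrete challenge could sit on top of this base class. The class name, the two paths, and the final assertion are hypothetical and only illustrate the intended flow (deserialize the spec via CHALLENGE_LOCATION, run the agent via setup_challenge, then collect results with get_scores); they are not part of this commit.

# Hypothetical example, not part of this commit: a concrete challenge subclass.
class TestWriteFile(Challenge):
    # Placeholder paths; a real challenge points these at its data.json and artifacts folder.
    CHALLENGE_LOCATION = "challenges/abilities/write_file/data.json"
    ARTIFACTS_LOCATION = "challenges/abilities/write_file"

    def test_method(self, config: Dict[str, Any]) -> None:
        # setup_challenge() is a coroutine, so a real test would await it first
        # (e.g. via an async test runner) before scoring.
        # get_scores() expects config["workspace"] to point at the agent's workspace
        # and returns {"values": [...], "answers": {...}, ...} as built above.
        scores_data = self.get_scores(config)
        assert 1.0 in scores_data["values"]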