import glob
import math
import os
import subprocess
import sys
from abc import ABC
from pathlib import Path
from typing import Any, Dict, List

import openai
import pytest

from agbenchmark.__main__ import OPTIONAL_CATEGORIES
from agbenchmark.agent_api_interface import run_api_agent
from agbenchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData, Ground
from agbenchmark.utils.prompts import (
    END_PROMPT,
    FEW_SHOT_EXAMPLES,
    PROMPT_MAP,
    SCORING_MAP,
)
from agbenchmark.utils.utils import agent_eligibible_for_optional_categories


class Challenge(ABC):
    """The parent class to all specific challenge classes.
    Defines helper methods for running a challenge."""

    _data_cache: Dict[str, ChallengeData] = {}
    CHALLENGE_LOCATION: str = ""
    ARTIFACTS_LOCATION: str = ""  # set per challenge, alongside CHALLENGE_LOCATION
    scores: dict[str, Any] = {}  # this is for suites

    @property
    def data(self) -> ChallengeData:
        if self.CHALLENGE_LOCATION not in self._data_cache:
            self._data_cache[self.CHALLENGE_LOCATION] = ChallengeData.deserialize(
                self.CHALLENGE_LOCATION
            )
        return self._data_cache[self.CHALLENGE_LOCATION]

    @property
    def task(self) -> str:
        return self.data.task

    @property
    def dependencies(self) -> list:
        return self.data.dependencies

    async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
        from agbenchmark.agent_interface import (
            copy_artifacts_into_workspace,
            run_agent,
        )

        artifact_paths = [
            self.ARTIFACTS_LOCATION,
            str(Path(self.CHALLENGE_LOCATION).parent),
        ]
        for path in artifact_paths:
            copy_artifacts_into_workspace(config["workspace"], "artifacts_in", path)

        if not self.task:
            return

        print(
            f"\033[1;35m============Starting {self.data.name} challenge============\033[0m"
        )
        print(f"\033[1;30mTask: {self.task}\033[0m")

        if "--api_mode" in sys.argv:
            await run_api_agent(self.data, config, self.ARTIFACTS_LOCATION, cutoff)
        elif "--mock" in sys.argv:
            print("Running mock agent")
            for path in artifact_paths:
                copy_artifacts_into_workspace(
                    config["workspace"], "artifacts_out", path
                )
        else:
            agent_benchmark_config: AgentBenchmarkConfig = config[
                "AgentBenchmarkConfig"
            ]
            run_agent(self.task, cutoff, agent_config=agent_benchmark_config)

        # Hidden files are added after the agent runs; they can be Python test files.
        # We copy them into the workspace to make it easy to import the code
        # produced by the agent.
        for path in artifact_paths:
            copy_artifacts_into_workspace(config["workspace"], "custom_python", path)

    def test_method(self, config: Dict[str, Any]) -> None:
        raise NotImplementedError

    @staticmethod
    def open_file(workspace: str, filename: str) -> str:
        script_dir = workspace
        workspace_dir = os.path.join(script_dir, filename)
        with open(workspace_dir, "r") as f:
            return f.read()

    def get_artifacts_out(
        self, workspace: str | dict[str, str], ground: Ground
    ) -> List[str]:
        if isinstance(workspace, dict):
            workspace = workspace["output"]

        script_dir = workspace
        files_contents = []

        for file_pattern in ground.files:
            # Check if it is a file extension
            if file_pattern.startswith("."):
                # Find all files with the given extension in the workspace
                matching_files = glob.glob(
                    os.path.join(script_dir, "*" + file_pattern)
                )
            else:
                # Otherwise, it is a specific file
                matching_files = [os.path.join(script_dir, file_pattern)]

            for file_path in matching_files:
                if ground.eval.type == "python":
                    result = subprocess.run(
                        [sys.executable, file_path],
                        cwd=os.path.abspath(workspace),
                        capture_output=True,
                        text=True,
                    )
                    if "error" in result.stderr or result.returncode != 0:
                        print(result.stderr)
                        assert False, result.stderr
                    files_contents.append(f"Output: {result.stdout}\n")
                else:
                    with open(file_path, "r") as f:
                        files_contents.append(f.read())
        else:
            # for-else: runs after the loop over ground.files completes
            # (there is no break in the loop)
            if ground.eval.type == "pytest":
                result = subprocess.run(
                    [sys.executable, "-m", "pytest"],
                    cwd=os.path.abspath(workspace),
                    capture_output=True,
                    text=True,
                )
                if "error" in result.stderr or result.returncode != 0:
                    print(result.stderr)
                    assert False, result.stderr
                files_contents.append(f"Output: {result.stdout}\n")

        return files_contents

    @staticmethod
    def write_to_file(workspace: str, filename: str, content: str) -> None:
        script_dir = workspace
        print("Writing file at", script_dir)
        workspace_dir = os.path.join(script_dir, filename)

        # Open the file in write mode.
        with open(workspace_dir, "w") as f:
            # Write the content to the file.
            f.write(content)

    def get_filenames_in_workspace(self, workspace: str) -> List[str]:
        return [
            filename
            for filename in os.listdir(workspace)
            if os.path.isfile(os.path.join(workspace, filename))
        ]

    def scoring(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
        print("\033[1;34mScoring content:\033[0m", content)
        if ground.should_contain:
            for should_contain_word in ground.should_contain:
                print_content = (
                    f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
                )
                if should_contain_word not in content:
                    print(print_content, "False")
                    return 0.0
                else:
                    print(print_content, "True")

        if ground.should_not_contain:
            for should_not_contain_word in ground.should_not_contain:
                print_content = f"\033[1;34mWord that should not exist\033[0m - {should_not_contain_word}:"
                if should_not_contain_word in content:
                    print(print_content, "False")
                    return 0.0
                else:
                    print(print_content, "True")

        return 1.0

    def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
        openai.api_key = os.getenv("OPENAI_API_KEY")
        if "--mock" in sys.argv:
            return 1.0

        # the validation for this is done in the Eval BaseModel
        scoring = SCORING_MAP[ground.eval.scoring]  # type: ignore
        prompt = PROMPT_MAP[ground.eval.template].format(task=self.data.task, scoring=scoring, answer=ground.answer, response=content)  # type: ignore

        if ground.eval.examples:
            prompt += FEW_SHOT_EXAMPLES.format(examples=ground.eval.examples)

        prompt += END_PROMPT

        answer = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": prompt},
            ],
        )

        return float(answer["choices"][0]["message"]["content"])  # type: ignore

    def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
        scores = []
        scores_dict: Any = {}
        percentage = None
        answers = {}
        try:
            if self.data.task == "" and "--mock" in sys.argv:
                scores = [1.0]
                answers = {"mock": "This is a mock answer"}
            elif isinstance(self.data.ground, Ground):
                files_contents = self.get_artifacts_out(
                    config["workspace"], self.data.ground
                )
                answers = {"answer": files_contents}
                for file_content in files_contents:
                    score = self.scoring(config, file_content, self.data.ground)
                    print("\033[1;32mYour score is:\033[0m", score)
                    scores.append(score)

                if self.data.ground.eval.type == "llm":
                    llm_eval = self.llm_eval(
                        config, "\n".join(files_contents), self.data.ground
                    )
                    if self.data.ground.eval.scoring == "percentage":
                        scores.append(math.ceil(llm_eval / 100))
                    elif self.data.ground.eval.scoring == "scale":
                        scores.append(math.ceil(llm_eval / 10))
                    print("\033[1;32mYour score is:\033[0m", llm_eval)

                    scores.append(llm_eval)
        except Exception as e:
            print("Error getting scores", e)

        scores_data = {
            "values": scores,
            "scores_obj": scores_dict,
            "percentage": percentage,
            "answers": answers,
        }

        self.scores[self.__class__.__name__] = scores_data

        return scores_data

    def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
        return 1  # remove this once this works
        if 1 in scores.get("scores_obj", {}).get(test_name, []):
            return 1

        return None

    def skip_optional_categories(self, config: Dict[str, Any]) -> None:
        challenge_category = self.data.category
        categories = [
            category
            for category in OPTIONAL_CATEGORIES
            if category in challenge_category
        ]
        if not agent_eligibible_for_optional_categories(
            categories, config.get("category", [])
        ):
            pytest.skip("Agent is not eligible for this category")
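

# Illustrative sketch, not part of the benchmark's own code: concrete challenge
# classes are normally generated dynamically from each challenge's data.json, but
# a hand-written subclass would combine the helpers above roughly as below. The
# class name, file paths, and cutoff value are hypothetical placeholders; the
# `config` dict (with a "workspace" entry, etc.) is assumed to be supplied by the
# benchmark's pytest fixtures, and pytest-asyncio is assumed for the async test.
class ExampleChallenge(Challenge):
    CHALLENGE_LOCATION = "path/to/example/data.json"  # hypothetical path
    ARTIFACTS_LOCATION = "path/to/example"  # hypothetical path

    @pytest.mark.asyncio
    async def test_method(self, config: Dict[str, Any]) -> None:  # type: ignore[override]
        # Skip if the challenge belongs to an optional category the agent opted out of.
        self.skip_optional_categories(config)
        # Copy input artifacts and run the agent with a hypothetical 60-second cutoff.
        await self.setup_challenge(config, cutoff=60)
        # Collect outputs from the workspace and assert that at least one passed.
        scores = self.get_scores(config)
        assert 1 in scores["values"]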