diff --git a/.github/workflows/benchmark-ci.yml b/.github/workflows/benchmark-ci.yml index f58e7fa3..f7cdeac5 100644 --- a/.github/workflows/benchmark-ci.yml +++ b/.github/workflows/benchmark-ci.yml @@ -240,6 +240,8 @@ jobs: poetry run uvicorn server:app --reload & sleep 5 export AGENT_NAME=mini-agi + echo "poetry run agbenchmark start --mock --api_mode --host=http://localhost:8000" + poetry run agbenchmark start --mock --api_mode --host=http://localhost:8000 else echo "${prefix}agbenchmark start" ${prefix}agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." diff --git a/benchmark/agbenchmark/generate_test.py b/benchmark/agbenchmark/generate_test.py index a26b1987..b4d6b201 100644 --- a/benchmark/agbenchmark/generate_test.py +++ b/benchmark/agbenchmark/generate_test.py @@ -231,10 +231,13 @@ def generate_tests() -> None: # sourcery skip: invert-any-all # for suites to know if the file has already been used to generate the tests # Dynamic class creation + while json_files: json_file = ( json_files.popleft() ) # Take and remove the first element from json_files + if challenge_should_be_ignored(json_file): + continue data = ChallengeData.get_json_from_path(json_file) suite_config = SuiteConfig.suite_data_if_suite(Path(json_file)) @@ -293,4 +296,8 @@ def generate_tests() -> None: # sourcery skip: invert-any-all print(f"Generated test for {data['name']}.") +def challenge_should_be_ignored(json_file): + return "challenges/deprecated" in json_file or "challenges/library" in json_file + + generate_tests() diff --git a/benchmark/agbenchmark/start_benchmark.py b/benchmark/agbenchmark/start_benchmark.py index ae52cc75..a94ae234 100644 --- a/benchmark/agbenchmark/start_benchmark.py +++ b/benchmark/agbenchmark/start_benchmark.py @@ -95,7 +95,8 @@ def run_benchmark( test: Optional[str] = None, suite: Optional[str] = None, cutoff: Optional[int] = None, - server: bool = False, + api_mode: bool = False, + host: Optional[str] = None, ) -> int: """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty @@ -132,7 +133,12 @@ def run_benchmark( config = json.load(f) else: config = {} - + host = host or config.get("host") + api_mode = api_mode or config.get("api_mode") + if host: + config["host"] = host + if api_mode: + config["api_mode"] = api_mode print("benchmark run path", CONFIG_PATH, HOME_DIRECTORY) if not config.get("workspace"): config["workspace"] = click.prompt( @@ -141,7 +147,7 @@ def run_benchmark( show_default=True, ) - if config.get("api_mode") and not config.get("host"): + if api_mode and not host: config["host"] = click.prompt( "Please enter the Agent API host address", default="http://localhost:8000", @@ -195,7 +201,10 @@ def run_benchmark( elif explore: print("Only attempt challenges that have never been beaten") pytest_args.append("--explore") - + if host: + pytest_args.append(f"--host={host}") + if api_mode: + pytest_args.append("--api_mode") if mock: pytest_args.append("--mock") @@ -215,6 +224,8 @@ def run_benchmark( print(f"Setting cuttoff override to {cutoff} seconds.") pytest_args.extend((str(CURRENT_DIRECTORY), "--cache-clear")) + pytest_args.append("--disable-warnings") + return pytest.main(pytest_args) @@ -249,6 +260,8 @@ def cli() -> None: ) @click.option("--nc", is_flag=True, help="Run without cutoff") @click.option("--cutoff", help="Set or override tests cutoff (seconds)") +@click.option("--api_mode", help="API mode") +@click.option("--host", help="Define API host") def start( maintain: bool, improve: bool, @@ -262,6 +275,8 @@ def start( suite: Optional[str] = None, cutoff: Optional[int] = None, backend: Optional[bool] = False, + api_mode: bool = False, + host: Optional[str] = None, ) -> Any: # Redirect stdout if backend is True original_stdout = sys.stdout # Save the original standard output @@ -282,6 +297,8 @@ def start( test=test, suite=suite, cutoff=cutoff, + api_mode=api_mode, + host=host, ) sys.stdout = original_stdout @@ -404,4 +421,4 @@ def get_regression_data() -> Any: # if __name__ == "__main__": -# start() +# start() diff --git a/benchmark/poetry.lock b/benchmark/poetry.lock index 129e111f..7d943095 100644 --- a/benchmark/poetry.lock +++ b/benchmark/poetry.lock @@ -152,6 +152,27 @@ files = [ [package.dependencies] frozenlist = ">=1.1.0" +[[package]] +name = "anyio" +version = "3.7.1" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.7" +files = [ + {file = "anyio-3.7.1-py3-none-any.whl", hash = "sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5"}, + {file = "anyio-3.7.1.tar.gz", hash = "sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780"}, +] + +[package.dependencies] +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" + +[package.extras] +doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-jquery"] +test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (<0.22)"] + [[package]] name = "appnope" version = "0.1.3" @@ -617,6 +638,26 @@ files = [ [package.extras] tests = ["asttokens", "littleutils", "pytest", "rich"] +[[package]] +name = "fastapi" +version = "0.103.1" +description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +optional = false +python-versions = ">=3.7" +files = [ + {file = "fastapi-0.103.1-py3-none-any.whl", hash = "sha256:5e5f17e826dbd9e9b5a5145976c5cd90bcaa61f2bf9a69aca423f2bcebe44d83"}, + {file = "fastapi-0.103.1.tar.gz", hash = "sha256:345844e6a82062f06a096684196aaf96c1198b25c06b72c1311b882aa2d8a35d"}, +] + +[package.dependencies] +anyio = ">=3.7.1,<4.0.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" +starlette = ">=0.27.0,<0.28.0" +typing-extensions = ">=4.5.0" + +[package.extras] +all = ["email-validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] + [[package]] name = "filelock" version = "3.12.3" @@ -2070,6 +2111,20 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "python-multipart" +version = "0.0.6" +description = "A streaming multipart parser for Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "python_multipart-0.0.6-py3-none-any.whl", hash = "sha256:ee698bab5ef148b0a760751c261902cd096e57e10558e11aca17646b74ee1c18"}, + {file = "python_multipart-0.0.6.tar.gz", hash = "sha256:e9925a80bb668529f1b67c7fdb0a5dacdd7cbfc6fb0bff3ea443fe22bdd62132"}, +] + +[package.extras] +dev = ["atomicwrites (==1.2.1)", "attrs (==19.2.0)", "coverage (==6.5.0)", "hatch", "invoke (==1.7.3)", "more-itertools (==4.3.0)", "pbr (==4.3.0)", "pluggy (==1.0.0)", "py (==1.11.0)", "pytest (==7.2.0)", "pytest-cov (==4.0.0)", "pytest-timeout (==2.1.0)", "pyyaml (==5.1)"] + [[package]] name = "pytz" version = "2023.3.post1" @@ -2295,6 +2350,23 @@ pure-eval = "*" [package.extras] tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] +[[package]] +name = "starlette" +version = "0.27.0" +description = "The little ASGI library that shines." +optional = false +python-versions = ">=3.7" +files = [ + {file = "starlette-0.27.0-py3-none-any.whl", hash = "sha256:918416370e846586541235ccd38a474c08b80443ed31c578a418e2209b3eef91"}, + {file = "starlette-0.27.0.tar.gz", hash = "sha256:6a6b0d042acb8d469a01eba54e9cda6cbd24ac602c4cd016723117d6a7e73b75"}, +] + +[package.dependencies] +anyio = ">=3.4.0,<5" + +[package.extras] +full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] + [[package]] name = "tomli" version = "2.0.1" @@ -2597,4 +2669,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "c97e1a4310f7d362f157f164b01393eb21fd182e197384c7867cfe002ea46506" +content-hash = "e86dcefdd1198516ad76fafb4877fd46f5b8623a7be52e069c3ac39509ba7c4e" diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml index 0bc50af0..88740f4b 100644 --- a/benchmark/pyproject.toml +++ b/benchmark/pyproject.toml @@ -29,6 +29,8 @@ selenium = "^4.11.2" agent-protocol-client = "^0.2.2" pytest-asyncio = "^0.21.1" uvicorn = "^0.23.2" +fastapi = "^0.103.1" +python-multipart = "^0.0.6" [tool.poetry.group.dev.dependencies] diff --git a/benchmark/server.py b/benchmark/server.py index 5537a6ef..e7f639eb 100644 --- a/benchmark/server.py +++ b/benchmark/server.py @@ -17,10 +17,6 @@ logger = logging.getLogger(__name__) app = FastAPI() artifacts: List[Dict[str, Any]] = [] -with open("agent/gpt-engineer/agbenchmark/config.json", "r") as file: - config = json.load(file) - logger.info("Loaded configuration") - class Task(BaseModel): input: str @@ -34,7 +30,11 @@ async def upload_file( "Uploading file for task_id: %s with relative path: %s", task_id, relative_path ) absolute_directory_path = Path(__file__).parent.absolute() - save_path = absolute_directory_path / "agent/gpt-engineer" / config["workspace"] + save_path = ( + absolute_directory_path + / "agent/gpt-engineer" + / "projects/my-new-project/workspace" + ) random_string = str(randint(0, 100000)) while random_string in artifacts: