diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e93..e457b85c 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. -- **dependencies** (str[]): The dependencies that the challenge needs to run. +- **dependencies** (str[]): The dependencies that the challenge needs to run. Each entry must be the full pytest node ID of the test function it depends on. - **ground** (dict): The ground truth. - **answer** (str): The raw text of the ground truth answer. - **should_contain** (list): The exact strings that are required in the final answer. @@ -23,18 +23,20 @@ Example: ```python { "category": ["basic"], - "task": "What is the capital of America?", + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_read_file_mock", "info": { - "difficulty": "easy", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 5e6d6abf..45becaf7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ 
b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -from pytest_dependency import depends data = ChallengeData.deserialize( @@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, request, workspace): - depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 0cada86c..56320740 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.Challenge import Challenge -@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 4d04f33e..8c5ef62d 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,9 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ad08da4e..494a9b07 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ 
b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e..0a4ef4a2 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f312..57b94cd7 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf49..d7939fbf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional 
= false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = "0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = 
false python-versions = "*" files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = "*" -files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, -] - -[package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = 
"a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c5204..0a4f8ba7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ]