diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt new file mode 100644 index 00000000..1b1e0147 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. + +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt new file mode 100644 index 00000000..92203f5c --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt @@ -0,0 +1,5 @@ 
+2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt new file mode 100644 index 00000000..d7f6f08f --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. 
Read the file instructions_4.txt. + +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt new file mode 100644 index 00000000..6c0a13ba --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. 
+ +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt new file mode 100644 index 00000000..729f0aa0 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the phrases previously mentioned to a file called file_to_check.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json new file mode 100644 index 00000000..e1ecb16f --- /dev/null +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json @@ -0,0 +1,26 @@ +{ + "name": "remember_multiple_phrases_with_noise_mock", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "should_contain": [ + "The purple elephant danced on a rainbow while eating a taco", + "The sneaky toaster stole my socks and ran away to Hawaii", + "My pet rock sings better than Beyoncé on Tuesdays", + "The giant hamster rode a unicycle through the crowded mall" + ], + "should_not_contain": [], + "files": ["file_to_check.txt"] + }, + "mock": { + "mock_func": "remember_multiple_phrases_with_noise_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to remember information between each action. 
Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py new file mode 100644 index 00000000..fd33da1c --- /dev/null +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -0,0 +1,32 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.memory.memory import MemoryChallenge + + +class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): + """The fourth memory challenge (m4): recall several phrases spread across noisy instruction files""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join( + os.path.dirname(__file__), "remember_multiple_phrases_with_noise_data.json" + ) + + @pytest.mark.depends( + name="test_remember_multiple_phrases_with_noise", + depends=["test_remember_multiple_ids_with_noise"], + ) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.open_files(config["workspace"], self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 1ffb3de3..37ded0ae 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -77,3 +77,14 @@ def remember_multiple_ids_mock(task: str, workspace: str) -> None: "file_to_check.txt", "3145\n3791\n9317\n9471", ) + + +def remember_multiple_phrases_with_noise_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file 
(creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + ) diff --git a/regression_tests.json b/regression_tests.json index 3b91a5c9..1195efbc 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -40,5 +40,10 @@ "difficulty": "medium", "dependencies": [], "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" } } \ No newline at end of file