From 8aa6452cc4c76610597ae56f90d5af91170cd1eb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 17 Jul 2023 11:24:16 -0400 Subject: [PATCH] file naming when --test (#164) --- agbenchmark/reports/internal_info.json | 110 +++++++++++------- .../reports/mini-agi/1.1_TestWriteFile.json | 36 ++++++ .../reports/mini-agi/1_TestWriteFIle.json | 27 +++++ .../reports/mini-agi/2.1_TestReadFile.json | 27 +++++ .../reports/mini-agi/2_TestReadFile.json | 27 +++++ .../reports/mini-agi/3_TestSearch.json | 27 +++++ .../4.1_TestDebugSimpleTypoWithGuidance.json | 28 +++++ .../4_TestDebugSimpleTypoWithGuidance.json | 28 +++++ .../reports/mini-agi/file1_07-16-13-07.json | 23 ---- agbenchmark/utils.py | 52 +++++++-- agent/mini-agi | 2 +- 11 files changed, 315 insertions(+), 72 deletions(-) create mode 100644 agbenchmark/reports/mini-agi/1.1_TestWriteFile.json create mode 100644 agbenchmark/reports/mini-agi/1_TestWriteFIle.json create mode 100644 agbenchmark/reports/mini-agi/2.1_TestReadFile.json create mode 100644 agbenchmark/reports/mini-agi/2_TestReadFile.json create mode 100644 agbenchmark/reports/mini-agi/3_TestSearch.json create mode 100644 agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json delete mode 100644 agbenchmark/reports/mini-agi/file1_07-16-13-07.json diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json index 97b525c0..0bfad744 100644 --- a/agbenchmark/reports/internal_info.json +++ b/agbenchmark/reports/internal_info.json @@ -1,40 +1,72 @@ { - "mini-agi": { - "TestBasicMemory": [true, true, true], - "TestBasicRetrieval": [true, true, true], - "TestCreateSimpleWebServer": [false, false, false], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false, - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [false, false, false], - "TestReadFile": [true, true, true, true], - "TestRememberMultipleIds": [true, true, true], - "TestRememberMultipleIdsWithNoise": [true, true, true], - "TestRememberMultiplePhrasesWithNoise": [true, true, true], - "TestRetrieval2": [true, true, true], - "TestRetrieval3": [true, true, true], - "TestSearch": [true, true, true, true], - "TestWriteFile": [ - true, - true, - true, - false, - false, - false, - false, - true, - false, - true, - false, - false, - false, - false, - true - ] - } -} + "mini-agi": { + "TestBasicMemory": [ + true, + true, + true + ], + "TestBasicRetrieval": [ + true, + true, + true + ], + "TestCreateSimpleWebServer": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [ + false, + false, + false + ], + "TestReadFile": [ + true, + true, + true, + true, + true + ], + "TestRememberMultipleIds": [ + true, + true, + true + ], + "TestRememberMultipleIdsWithNoise": [ + true, + true, + true + ], + "TestRememberMultiplePhrasesWithNoise": [ + true, + true, + true + ], + "TestRetrieval2": [ + true, + true, + true + ], + "TestRetrieval3": [ + true, + true, + true + ], + "TestSearch": [ + true, + true, + true, + true + ], + "TestWriteFile": [ + true, + true, + true + ] + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json new file mode 100644 index 00000000..637c2d5c --- /dev/null +++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json @@ -0,0 +1,36 @@ +{ + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.009 seconds" + } + }, + "additional": { + "model": "gpt-3.5-turbo" + }, + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-17-09:54", + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "metrics": { + "run_time": "22.36 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 40.0, + "run_time": "22.169 seconds" + } + } + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json new file mode 100644 index 00000000..e6478319 --- /dev/null +++ b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-15-22:13", + "metrics": { + "run_time": "12.4 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 50.0, + "run_time": "12.127 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json new file mode 100644 index 00000000..b5d73af9 --- /dev/null +++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestReadFile", + "completion_time": "2023-07-17-10:12", + "metrics": { + "run_time": "65.27 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "65.074 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4", + "reached_termination_time": true + } +} diff --git a/agbenchmark/reports/mini-agi/2_TestReadFile.json b/agbenchmark/reports/mini-agi/2_TestReadFile.json new file mode 100644 index 00000000..869eaaac --- /dev/null +++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestReadFile", + "completion_time": "2023-07-15-22:13", + "metrics": { + "run_time": "31.2 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "30.903 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/3_TestSearch.json b/agbenchmark/reports/mini-agi/3_TestSearch.json new file mode 100644 index 00000000..d9d05db4 --- /dev/null +++ b/agbenchmark/reports/mini-agi/3_TestSearch.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestSearch", + "completion_time": "2023-07-15-22:14", + "metrics": { + "run_time": "16.88 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "16.572 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 00000000..d72d599d --- /dev/null +++ b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-15-22:16", + "metrics": { + "run_time": "45.92 seconds", + "highest_difficulty": ": 0" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "45.599 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 00000000..7985a784 --- /dev/null +++ b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-15-22:15", + "metrics": { + "run_time": "32.99 seconds", + "highest_difficulty": ": 0" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "32.582 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json deleted file mode 100644 index 78bafc5f..00000000 --- a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "command": "agbenchmark start --test TestWriteFile", - "completion_time": "2023-07-16-13:07", - "metrics": { - "run_time": "13.91 seconds", - "highest_difficulty": "interface: 1" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 30.0, - "run_time": "13.684 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}" - } -} \ No newline at end of file diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index e99a1fa0..5f1bb30d 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,7 +1,9 @@ # radio charts, logs, helper functions for tests, anything else relevant. import glob +import math import os import re +import sys from datetime import datetime from pathlib import Path from typing import Any @@ -17,17 +19,49 @@ HOME_ENV = os.getenv("HOME_ENV") def calculate_info_test_path(reports_path: Path) -> str: + command = sys.argv + if not reports_path.exists(): reports_path.mkdir(parents=True, exist_ok=True) - return str( - reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" - ) - else: - json_files = glob.glob(str(reports_path / "*.json")) - file_count = len(json_files) - run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" - new_file_path = reports_path / run_name - return str(new_file_path) + + json_files = glob.glob(str(reports_path / "*.json")) + + # Default naming scheme + file_count = len(json_files) + run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" + + # # If "--test" is in command + if "--test" in command: + test_index = command.index("--test") + try: + test_arg = command[test_index + 1] # Argument after --test + except IndexError: + raise ValueError("Expected an argument after --test") + + # Get all files that include the string that is the argument after --test + related_files = [f for f in json_files if test_arg in f] + related_file_count = len(related_files) + + # Determine the prefix based on the existing files + if related_file_count == 0: + # Try to find the highest prefix number among all files, then increment it + all_prefix_numbers = [] + for f in json_files: + number = float(Path(f).stem.split("_")[0]) + all_prefix_numbers.append(math.floor(number)) + + max_prefix = max(all_prefix_numbers, default=0) + print("HEY WE ARE HERE BIG DAWG", max_prefix) + run_name = f"{max_prefix + 1}_{test_arg}.json" + else: + # Take the number from before the _ and add the .{number} + prefix_str = Path(related_files[0]).stem.rsplit("_", 1)[0].split(".")[0] + prefix = math.floor(float(prefix_str)) + run_name = f"{prefix}.{related_file_count}_{test_arg}.json" + + print("run_namerun_namerun_name", run_name) + new_file_path = reports_path / run_name + return str(new_file_path) def replace_backslash(value: Any) -> Any: diff --git a/agent/mini-agi b/agent/mini-agi index bb02bf0d..0a9fcd8c 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011 +Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d