file naming when --test (#164)

2026-01-21 23:14:24 +01:00 · 2023-07-17 11:24:16 -04:00
parent dffc1dfd51
commit 8aa6452cc4
11 changed files with 315 additions and 72 deletions
--- a/agbenchmark/reports/internal_info.json
+++ b/agbenchmark/reports/internal_info.json
@@ -1,40 +1,72 @@
 {
-  "mini-agi": {
-    "TestBasicMemory": [true, true, true],
-    "TestBasicRetrieval": [true, true, true],
-    "TestCreateSimpleWebServer": [false, false, false],
-    "TestDebugSimpleTypoWithGuidance": [
-      false,
-      false,
-      false,
-      false,
-      false,
-      false
-    ],
-    "TestDebugSimpleTypoWithoutGuidance": [false, false, false],
-    "TestReadFile": [true, true, true, true],
-    "TestRememberMultipleIds": [true, true, true],
-    "TestRememberMultipleIdsWithNoise": [true, true, true],
-    "TestRememberMultiplePhrasesWithNoise": [true, true, true],
-    "TestRetrieval2": [true, true, true],
-    "TestRetrieval3": [true, true, true],
-    "TestSearch": [true, true, true, true],
-    "TestWriteFile": [
-      true,
-      true,
-      true,
-      false,
-      false,
-      false,
-      false,
-      true,
-      false,
-      true,
-      false,
-      false,
-      false,
-      false,
-      true
-    ]
-  }
-}
+    "mini-agi": {
+        "TestBasicMemory": [
+            true,
+            true,
+            true
+        ],
+        "TestBasicRetrieval": [
+            true,
+            true,
+            true
+        ],
+        "TestCreateSimpleWebServer": [
+            false,
+            false,
+            false
+        ],
+        "TestDebugSimpleTypoWithGuidance": [
+            false,
+            false,
+            false
+        ],
+        "TestDebugSimpleTypoWithoutGuidance": [
+            false,
+            false,
+            false
+        ],
+        "TestReadFile": [
+            true,
+            true,
+            true,
+            true,
+            true
+        ],
+        "TestRememberMultipleIds": [
+            true,
+            true,
+            true
+        ],
+        "TestRememberMultipleIdsWithNoise": [
+            true,
+            true,
+            true
+        ],
+        "TestRememberMultiplePhrasesWithNoise": [
+            true,
+            true,
+            true
+        ],
+        "TestRetrieval2": [
+            true,
+            true,
+            true
+        ],
+        "TestRetrieval3": [
+            true,
+            true,
+            true
+        ],
+        "TestSearch": [
+            true,
+            true,
+            true,
+            true
+        ],
+        "TestWriteFile": [
+            true,
+            true,
+            true
+        ]
+    }
+}
--- a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json
+++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json
@@ -0,0 +1,36 @@
+{
+    "TestWriteFile": {
+        "data_path": "agbenchmark/challenges/interface/write_file",
+        "is_regression": true,
+        "metrics": {
+            "difficulty": "interface",
+            "success": true,
+            "non_mock_success_%": 100.0,
+            "run_time": "0.009 seconds"
+        }
+    },
+    "additional": {
+        "model": "gpt-3.5-turbo"
+    },
+    "command": "agbenchmark start --test TestWriteFile",
+    "completion_time": "2023-07-17-09:54",
+    "config": {
+        "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+    },
+    "metrics": {
+        "run_time": "22.36 seconds",
+        "highest_difficulty": "interface: 1"
+    },
+    "tests": {
+        "TestWriteFile": {
+            "data_path": "agbenchmark/challenges/interface/write_file",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "interface",
+                "success": true,
+                "success_%": 40.0,
+                "run_time": "22.169 seconds"
+            }
+        }
+    }
+}
--- a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json
+++ b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestWriteFile",
+  "completion_time": "2023-07-15-22:13",
+  "metrics": {
+    "run_time": "12.4 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 50.0,
+        "run_time": "12.127 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
--- a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json
+++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestReadFile",
+  "completion_time": "2023-07-17-10:12",
+  "metrics": {
+    "run_time": "65.27 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestReadFile": {
+      "data_path": "agbenchmark/challenges/interface/read_file",
+      "is_regression": true,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "65.074 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4",
+    "reached_termination_time": true
+  }
+}
--- a/agbenchmark/reports/mini-agi/2_TestReadFile.json
+++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestReadFile",
+  "completion_time": "2023-07-15-22:13",
+  "metrics": {
+    "run_time": "31.2 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestReadFile": {
+      "data_path": "agbenchmark/challenges/interface/read_file",
+      "is_regression": true,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "30.903 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
--- a/agbenchmark/reports/mini-agi/3_TestSearch.json
+++ b/agbenchmark/reports/mini-agi/3_TestSearch.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestSearch",
+  "completion_time": "2023-07-15-22:14",
+  "metrics": {
+    "run_time": "16.88 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestSearch": {
+      "data_path": "agbenchmark/challenges/interface/search",
+      "is_regression": true,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "16.572 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
--- a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json
+++ b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,28 @@
+{
+  "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+  "completion_time": "2023-07-15-22:16",
+  "metrics": {
+    "run_time": "45.92 seconds",
+    "highest_difficulty": ": 0"
+  },
+  "tests": {
+    "TestDebugSimpleTypoWithGuidance": {
+      "data_path": "agbenchmark/challenges/code/d1",
+      "is_regression": false,
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "45.599 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
--- a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json
+++ b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,28 @@
+{
+  "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+  "completion_time": "2023-07-15-22:15",
+  "metrics": {
+    "run_time": "32.99 seconds",
+    "highest_difficulty": ": 0"
+  },
+  "tests": {
+    "TestDebugSimpleTypoWithGuidance": {
+      "data_path": "agbenchmark/challenges/code/d1",
+      "is_regression": false,
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "32.582 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
--- a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json
+++ b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json
@@ -1,23 +0,0 @@
-{
-    "command": "agbenchmark start --test TestWriteFile",
-    "completion_time": "2023-07-16-13:07",
-    "metrics": {
-        "run_time": "13.91 seconds",
-        "highest_difficulty": "interface: 1"
-    },
-    "tests": {
-        "TestWriteFile": {
-            "data_path": "agbenchmark/challenges/interface/write_file",
-            "is_regression": false,
-            "metrics": {
-                "difficulty": "interface",
-                "success": true,
-                "success_%": 30.0,
-                "run_time": "13.684 seconds"
-            }
-        }
-    },
-    "config": {
-        "workspace": "${os.path.join(Path.home(), 'miniagi')}"
-    }
-}
--- a/agbenchmark/utils.py
+++ b/agbenchmark/utils.py
@@ -1,7 +1,9 @@
 # radio charts, logs, helper functions for tests, anything else relevant.
 import glob
+import math
 import os
 import re
+import sys
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -17,17 +19,49 @@ HOME_ENV = os.getenv("HOME_ENV")


 def calculate_info_test_path(reports_path: Path) -> str:
+    """Return the file path for the next JSON report in ``reports_path``.
+
+    The directory is created when missing. Without ``--test`` in ``sys.argv``
+    the name is ``file<N+1>_<MM-DD-HH-MM>.json`` (N = existing ``*.json`` count).
+    With ``--test <name>`` the first report is ``<max prefix + 1>_<name>.json``
+    and each rerun is ``<prefix>.<k>_<name>.json`` (k = existing reports of it).
+
+    Raises:
+        ValueError: if ``--test`` is the last command-line argument.
+    """
    if not reports_path.exists():
        reports_path.mkdir(parents=True, exist_ok=True)
-        return str(
-            reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json"
-        )
-    else:
-        json_files = glob.glob(str(reports_path / "*.json"))
-        file_count = len(json_files)
-        run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json"
-        new_file_path = reports_path / run_name
-        return str(new_file_path)
+
+    json_files = glob.glob(str(reports_path / "*.json"))
+    if "--test" not in sys.argv:
+        # Default naming scheme
+        timestamp = datetime.now().strftime("%m-%d-%H-%M")
+        return str(reports_path / f"file{len(json_files) + 1}_{timestamp}.json")
+
+    try:
+        test_arg = sys.argv[sys.argv.index("--test") + 1]  # Argument after --test
+    except IndexError:
+        raise ValueError("Expected an argument after --test") from None
+
+    # Split each stem into numeric prefix and report name. Stems whose prefix
+    # is not a number (e.g. default "file1_..." names) are skipped rather than
+    # letting float() raise and abort the run.
+    numbered = []
+    for f in json_files:
+        prefix_str, _, name = Path(f).stem.partition("_")
+        try:
+            numbered.append((math.floor(float(prefix_str)), name))
+        except (ValueError, OverflowError):
+            continue
+
+    # Exact name match, so "TestX" does not pick up "TestXWithNoise" reports
+    related = [prefix for prefix, name in numbered if name == test_arg]
+    if related:
+        run_name = f"{related[0]}.{len(related)}_{test_arg}.json"
+    else:
+        max_prefix = max((prefix for prefix, _ in numbered), default=0)
+        run_name = f"{max_prefix + 1}_{test_arg}.json"
+    return str(reports_path / run_name)


 def replace_backslash(value: Any) -> Any:
--- a/agent/mini-agi
+++ b/agent/mini-agi