From 8aa6452cc4c76610597ae56f90d5af91170cd1eb Mon Sep 17 00:00:00 2001
From: Silen Naihin <silen.naihin@gmail.com>
Date: Mon, 17 Jul 2023 11:24:16 -0400
Subject: [PATCH] file naming when --test (#164)

---
 agbenchmark/reports/internal_info.json        | 110 +++++++++++-------
 .../reports/mini-agi/1.1_TestWriteFile.json   |  36 ++++++
 .../reports/mini-agi/1_TestWriteFIle.json     |  27 +++++
 .../reports/mini-agi/2.1_TestReadFile.json    |  27 +++++
 .../reports/mini-agi/2_TestReadFile.json      |  27 +++++
 .../reports/mini-agi/3_TestSearch.json        |  27 +++++
 .../4.1_TestDebugSimpleTypoWithGuidance.json  |  28 +++++
 .../4_TestDebugSimpleTypoWithGuidance.json    |  28 +++++
 .../reports/mini-agi/file1_07-16-13-07.json   |  23 ----
 agbenchmark/utils.py                          |  52 +++++++--
 agent/mini-agi                                |   2 +-
 11 files changed, 315 insertions(+), 72 deletions(-)
 create mode 100644 agbenchmark/reports/mini-agi/1.1_TestWriteFile.json
 create mode 100644 agbenchmark/reports/mini-agi/1_TestWriteFIle.json
 create mode 100644 agbenchmark/reports/mini-agi/2.1_TestReadFile.json
 create mode 100644 agbenchmark/reports/mini-agi/2_TestReadFile.json
 create mode 100644 agbenchmark/reports/mini-agi/3_TestSearch.json
 create mode 100644 agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json
 create mode 100644 agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json
 delete mode 100644 agbenchmark/reports/mini-agi/file1_07-16-13-07.json

diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json
index 97b525c0..0bfad744 100644
--- a/agbenchmark/reports/internal_info.json
+++ b/agbenchmark/reports/internal_info.json
@@ -1,40 +1,72 @@
 {
-  "mini-agi": {
-    "TestBasicMemory": [true, true, true],
-    "TestBasicRetrieval": [true, true, true],
-    "TestCreateSimpleWebServer": [false, false, false],
-    "TestDebugSimpleTypoWithGuidance": [
-      false,
-      false,
-      false,
-      false,
-      false,
-      false
-    ],
-    "TestDebugSimpleTypoWithoutGuidance": [false, false, false],
-    "TestReadFile": [true, true, true, true],
-    "TestRememberMultipleIds": [true, true, true],
-    "TestRememberMultipleIdsWithNoise": [true, true, true],
-    "TestRememberMultiplePhrasesWithNoise": [true, true, true],
-    "TestRetrieval2": [true, true, true],
-    "TestRetrieval3": [true, true, true],
-    "TestSearch": [true, true, true, true],
-    "TestWriteFile": [
-      true,
-      true,
-      true,
-      false,
-      false,
-      false,
-      false,
-      true,
-      false,
-      true,
-      false,
-      false,
-      false,
-      false,
-      true
-    ]
-  }
-}
+    "mini-agi": {
+        "TestBasicMemory": [
+            true,
+            true,
+            true
+        ],
+        "TestBasicRetrieval": [
+            true,
+            true,
+            true
+        ],
+        "TestCreateSimpleWebServer": [
+            false,
+            false,
+            false
+        ],
+        "TestDebugSimpleTypoWithGuidance": [
+            false,
+            false,
+            false
+        ],
+        "TestDebugSimpleTypoWithoutGuidance": [
+            false,
+            false,
+            false
+        ],
+        "TestReadFile": [
+            true,
+            true,
+            true,
+            true,
+            true
+        ],
+        "TestRememberMultipleIds": [
+            true,
+            true,
+            true
+        ],
+        "TestRememberMultipleIdsWithNoise": [
+            true,
+            true,
+            true
+        ],
+        "TestRememberMultiplePhrasesWithNoise": [
+            true,
+            true,
+            true
+        ],
+        "TestRetrieval2": [
+            true,
+            true,
+            true
+        ],
+        "TestRetrieval3": [
+            true,
+            true,
+            true
+        ],
+        "TestSearch": [
+            true,
+            true,
+            true,
+            true
+        ],
+        "TestWriteFile": [
+            true,
+            true,
+            true
+        ]
+    }
+}
\ No newline at end of file
diff --git a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json
new file mode 100644
index 00000000..637c2d5c
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json
@@ -0,0 +1,36 @@
+{
+    "TestWriteFile": {
+        "data_path": "agbenchmark/challenges/interface/write_file",
+        "is_regression": true,
+        "metrics": {
+            "difficulty": "interface",
+            "success": true,
+            "non_mock_success_%": 100.0,
+            "run_time": "0.009 seconds"
+        }
+    },
+    "additional": {
+        "model": "gpt-3.5-turbo"
+    },
+    "command": "agbenchmark start --test TestWriteFile",
+    "completion_time": "2023-07-17-09:54",
+    "config": {
+        "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+    },
+    "metrics": {
+        "run_time": "22.36 seconds",
+        "highest_difficulty": "interface: 1"
+    },
+    "tests": {
+        "TestWriteFile": {
+            "data_path": "agbenchmark/challenges/interface/write_file",
+            "is_regression": false,
+            "metrics": {
+                "difficulty": "interface",
+                "success": true,
+                "success_%": 40.0,
+                "run_time": "22.169 seconds"
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json
new file mode 100644
index 00000000..e6478319
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestWriteFile",
+  "completion_time": "2023-07-15-22:13",
+  "metrics": {
+    "run_time": "12.4 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 50.0,
+        "run_time": "12.127 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json
new file mode 100644
index 00000000..b5d73af9
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestReadFile",
+  "completion_time": "2023-07-17-10:12",
+  "metrics": {
+    "run_time": "65.27 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestReadFile": {
+      "data_path": "agbenchmark/challenges/interface/read_file",
+      "is_regression": true,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "65.074 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4",
+    "reached_termination_time": true
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/2_TestReadFile.json b/agbenchmark/reports/mini-agi/2_TestReadFile.json
new file mode 100644
index 00000000..869eaaac
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestReadFile",
+  "completion_time": "2023-07-15-22:13",
+  "metrics": {
+    "run_time": "31.2 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestReadFile": {
+      "data_path": "agbenchmark/challenges/interface/read_file",
+      "is_regression": true,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "30.903 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/3_TestSearch.json b/agbenchmark/reports/mini-agi/3_TestSearch.json
new file mode 100644
index 00000000..d9d05db4
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/3_TestSearch.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestSearch",
+  "completion_time": "2023-07-15-22:14",
+  "metrics": {
+    "run_time": "16.88 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestSearch": {
+      "data_path": "agbenchmark/challenges/interface/search",
+      "is_regression": true,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "16.572 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json
new file mode 100644
index 00000000..d72d599d
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,28 @@
+{
+  "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+  "completion_time": "2023-07-15-22:16",
+  "metrics": {
+    "run_time": "45.92 seconds",
+    "highest_difficulty": ": 0"
+  },
+  "tests": {
+    "TestDebugSimpleTypoWithGuidance": {
+      "data_path": "agbenchmark/challenges/code/d1",
+      "is_regression": false,
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "45.599 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json
new file mode 100644
index 00000000..7985a784
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,28 @@
+{
+  "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+  "completion_time": "2023-07-15-22:15",
+  "metrics": {
+    "run_time": "32.99 seconds",
+    "highest_difficulty": ": 0"
+  },
+  "tests": {
+    "TestDebugSimpleTypoWithGuidance": {
+      "data_path": "agbenchmark/challenges/code/d1",
+      "is_regression": false,
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "32.582 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
+    "entry_path": "agbenchmark.benchmarks"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json
deleted file mode 100644
index 78bafc5f..00000000
--- a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-    "command": "agbenchmark start --test TestWriteFile",
-    "completion_time": "2023-07-16-13:07",
-    "metrics": {
-        "run_time": "13.91 seconds",
-        "highest_difficulty": "interface: 1"
-    },
-    "tests": {
-        "TestWriteFile": {
-            "data_path": "agbenchmark/challenges/interface/write_file",
-            "is_regression": false,
-            "metrics": {
-                "difficulty": "interface",
-                "success": true,
-                "success_%": 30.0,
-                "run_time": "13.684 seconds"
-            }
-        }
-    },
-    "config": {
-        "workspace": "${os.path.join(Path.home(), 'miniagi')}"
-    }
-}
\ No newline at end of file
diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py
index e99a1fa0..5f1bb30d 100644
--- a/agbenchmark/utils.py
+++ b/agbenchmark/utils.py
@@ -1,7 +1,9 @@
 # radio charts, logs, helper functions for tests, anything else relevant.
 import glob
+import math
 import os
 import re
+import sys
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -17,17 +19,49 @@ HOME_ENV = os.getenv("HOME_ENV")
 
 
 def calculate_info_test_path(reports_path: Path) -> str:
+    command = sys.argv
+
     if not reports_path.exists():
         reports_path.mkdir(parents=True, exist_ok=True)
-        return str(
-            reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json"
-        )
-    else:
-        json_files = glob.glob(str(reports_path / "*.json"))
-        file_count = len(json_files)
-        run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json"
-        new_file_path = reports_path / run_name
-        return str(new_file_path)
+
+    json_files = glob.glob(str(reports_path / "*.json"))
+
+    # Default naming scheme
+    file_count = len(json_files)
+    run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json"
+
+    # If "--test" was passed on the command line, name the report after that test
+    if "--test" in command:
+        test_index = command.index("--test")
+        try:
+            test_arg = command[test_index + 1]  # Argument after --test
+        except IndexError:
+            raise ValueError("Expected an argument after --test")
+
+        # Collect the existing report paths that contain the --test argument as a substring
+        related_files = [f for f in json_files if test_arg in f]
+        related_file_count = len(related_files)
+
+        # Determine the prefix based on the existing files
+        if related_file_count == 0:
+            # Try to find the highest prefix number among all files, then increment it
+            all_prefix_numbers = []
+            for f in json_files:
+                number = float(Path(f).stem.split("_")[0])
+                all_prefix_numbers.append(math.floor(number))
+
+            max_prefix = max(all_prefix_numbers, default=0)
+            print("HEY WE ARE HERE BIG DAWG", max_prefix)
+            run_name = f"{max_prefix + 1}_{test_arg}.json"
+        else:
+            # Reuse the integer prefix of the first related file and append ".{related_file_count}" as the rerun suffix
+            prefix_str = Path(related_files[0]).stem.rsplit("_", 1)[0].split(".")[0]
+            prefix = math.floor(float(prefix_str))
+            run_name = f"{prefix}.{related_file_count}_{test_arg}.json"
+
+    print("run_namerun_namerun_name", run_name)
+    new_file_path = reports_path / run_name
+    return str(new_file_path)
 
 
 def replace_backslash(value: Any) -> Any:
diff --git a/agent/mini-agi b/agent/mini-agi
index bb02bf0d..0a9fcd8c 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011
+Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d