Many small improvements, new benchmark results!

2025-12-17 12:45:26 +01:00 · 2023-06-21 00:37:18 +02:00
parent 4c68126295
commit 16dd96f50d
6 changed files with 97 additions and 33 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -44,6 +44,7 @@ archive
 # any log file
 *log.txt
 todo
 scratchpad
 # Ignore GPT Engineer files
 projects
--- a/benchmark/RESULTS.md
+++ b/benchmark/RESULTS.md
@@ -4,6 +4,36 @@
 $ python scripts/benchmark.py
 ```
 # 2023-06-21
 | Benchmark          | Ran | Works | Perfect |
 |--------------------|-----|-------|---------|
 | currency_converter | ✅  | ❌    | ❌      |
 | image_resizer      | ✅  | ✅    | ✅      |
 | pomodoro_timer     | ✅  | ✅    | ✅      |
 | url_shortener      | ✅  | ✅    | ✅      |
 | file_explorer      | ✅  | ✅    | ✅      |
 | markdown_editor    | ✅  | ✅    | ❌      |
 | timer_app          | ✅  | ❌    | ❌      |
 | weather_app        | ✅  | ✅    | ✅      |
 | file_organizer     | ✅  | ✅    | ✅      |
 | password_generator | ✅  | ✅    | ✅      |
 | todo_list          | ✅  | ✅    | ✅      |
 # Notes on the errors
 Most errors come from the "generate entrypoint" step being incorrect. Ignoring
 those, we get 8/11 fully correct.
 All errors are very easy to fix.
 One error was trying to modify a constant.
 One error was that the html template was not fully filled in.
 One error was that a dependency was used incorrectly; it is easy to fix.
 # 2023-06-19
 | Benchmark          | Ran | Works | Perfect |
 |--------------------|-----|-------|---------|
 | currency_converter | ❌  | ❌    | ❌      |
--- a/gpt_engineer/main.py
+++ b/gpt_engineer/main.py
@@ -7,6 +7,7 @@ from pathlib import Path
 import typer
 from gpt_engineer import steps
 from gpt_engineer.ai import AI
 from gpt_engineer.db import DB, DBs
 from gpt_engineer.steps import STEPS
@@ -20,7 +21,9 @@ def main(
    delete_existing: bool = typer.Argument(False, help="delete existing files"),
    model: str = "gpt-4",
    temperature: float = 0.1,
-    steps_config: str = "default",
+    steps_config: steps.Config = typer.Option(
        steps.Config.DEFAULT, "--steps", "-s", help="decide which steps to run"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
    run_prefix: str = typer.Option(
        "",
--- a/gpt_engineer/steps.py
+++ b/gpt_engineer/steps.py
@@ -2,6 +2,8 @@ import json
 import re
 import subprocess
 from enum import Enum
 from gpt_engineer.ai import AI
 from gpt_engineer.chat_to_files import to_files
 from gpt_engineer.db import DBs
@@ -34,10 +36,10 @@ def clarify(ai: AI, dbs: DBs):
            break
        print()
-        user = input('(answer in text, or "q" to move on)\n')
+        user = input('(answer in text, or "c" to move on)\n')
        print()
-        if not user or user == "q":
+        if not user or user == "c":
            break
        user += (
@@ -145,7 +147,7 @@ def execute_entrypoint(ai, dbs):
    print()
    print('If yes, press enter. Otherwise, type "no"')
    print()
-    if input() != "":
+    if input() not in ["", "y", "yes"]:
        print("Ok, not executing the code.")
        return []
    print("Executing the code...")
@@ -183,7 +185,7 @@ def use_feedback(ai: AI, dbs: DBs):
        ai.fassistant(dbs.workspace["all_output.txt"]),
        ai.fsystem(dbs.identity["use_feedback"]),
    ]
-    messages = ai.next(messages, dbs.memory["feedback"])
+    messages = ai.next(messages, dbs.input["feedback"])
    to_files(messages[-1]["content"], dbs.workspace)
    return messages
@@ -201,13 +203,36 @@ def fix_code(ai: AI, dbs: DBs):
    return messages
 class Config(str, Enum):
    DEFAULT = "default"
    BENCHMARK = "benchmark"
    SIMPLE = "simple"
    TDD = "tdd"
    TDD_PLUS = "tdd+"
    CLARIFY = "clarify"
    RESPEC = "respec"
    EXECUTE_ONLY = "execute_only"
    USE_FEEDBACK = "use_feedback"
 # Different configs of what steps to run
 STEPS = {
-    "default": [simple_gen, gen_entrypoint, execute_entrypoint],
+    Config.DEFAULT: [
-    "benchmark": [simple_gen, gen_entrypoint],
+        clarify,
-    "simple": [simple_gen, gen_entrypoint, execute_entrypoint],
+        gen_clarified_code,
-    "tdd": [gen_spec, gen_unit_tests, gen_code, gen_entrypoint, execute_entrypoint],
+        gen_entrypoint,
-    "tdd+": [
+        execute_entrypoint,
    ],
    Config.BENCHMARK: [simple_gen, gen_entrypoint],
    Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
    Config.TDD: [
        gen_spec,
        gen_unit_tests,
        gen_code,
        gen_entrypoint,
        execute_entrypoint,
    ],
    Config.TDD_PLUS: [
        gen_spec,
        gen_unit_tests,
        gen_code,
@@ -215,8 +240,13 @@ STEPS = {
        gen_entrypoint,
        execute_entrypoint,
    ],
-    "clarify": [clarify, gen_clarified_code, gen_entrypoint, execute_entrypoint],
+    Config.CLARIFY: [
-    "respec": [
+        clarify,
        gen_clarified_code,
        gen_entrypoint,
        execute_entrypoint,
    ],
    Config.RESPEC: [
        gen_spec,
        respec,
        gen_unit_tests,
@@ -224,12 +254,9 @@ STEPS = {
        gen_entrypoint,
        execute_entrypoint,
    ],
-    "execute_only": [execute_entrypoint],
+    Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint],
-    "use_feedback": [use_feedback],
+    Config.EXECUTE_ONLY: [gen_entrypoint, execute_entrypoint],
 }
 # Future steps that can be added:
-# self_reflect_and_improve_files,
+# run_tests_and_fix_files
 # add_tests
 # run_tests_and_fix_files,
 # improve_based_on_in_file_feedback_comments
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -35,7 +35,7 @@ def main(
                    "-m",
                    "gpt_engineer.main",
                    bench_folder,
-                    "--steps-config",
+                    "--steps",
                    "benchmark",
                ],
                stdout=log_file,
@@ -66,7 +66,7 @@ def main(
                    "-m",
                    "gpt_engineer.main",
                    bench_folder,
-                    "--steps-config",
+                    "--steps",
                    "execute_only",
                ],
            )
--- a/scripts/print_chat.py
+++ b/scripts/print_chat.py
@@ -16,6 +16,11 @@ def pretty_print_conversation(messages):
    }
    formatted_messages = []
    for message in messages:
        if message["role"] == "function":
            formatted_messages.append(
                f"function ({message['name']}): {message['content']}\n"
            )
        else:
            assistant_content = (
                message["function_call"]
                if message.get("function_call")
@@ -25,9 +30,7 @@ def pretty_print_conversation(messages):
                "system": f"system: {message['content']}\n",
                "user": f"user: {message['content']}\n",
                "assistant": f"assistant: {assistant_content}\n",
            "function": f"function ({message['name']}): {message['content']}\n",
            }
            formatted_messages.append(role_to_message[message["role"]])
    for formatted_message in formatted_messages: