Make sure benchmark runs and be more strict about collecting feedback (#397)

* Update benchmark script

* Bump version
Anton Osika, 2023-06-25 15:53:34 +02:00 (committed by GitHub)
parent b09f0e9193
commit 7f16abecb3
4 changed files with 7 additions and 5 deletions


@@ -59,7 +59,7 @@ def human_input() -> Review:
     print()
     ran = input("Did the generated code run at all? " + TERM_CHOICES)
-    while ran not in ("y", "n", "u", ""):
+    while ran not in ("y", "n", "u"):
         ran = input("Invalid input. Please enter y, n, or u: ")
     perfect = ""

@@ -69,12 +69,12 @@ def human_input() -> Review:
         perfect = input(
             "Did the generated code do everything you wanted? " + TERM_CHOICES
         )
-        while perfect not in ("y", "n", "u", ""):
+        while perfect not in ("y", "n", "u"):
             perfect = input("Invalid input. Please enter y, n, or u: ")
         if perfect != "y":
             useful = input("Did the generated code do anything useful? " + TERM_CHOICES)
-            while useful not in ("y", "n", "u", ""):
+            while useful not in ("y", "n", "u"):
                 useful = input("Invalid input. Please enter y, n, or u: ")
     comments = ""

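Dropping "" from the accepted answers is what makes feedback collection stricter: pressing Enter alone is now re-prompted instead of silently passing. All three prompts share the same validate-and-reprompt pattern; a minimal sketch of that stricter loop, using a hypothetical ask() helper that is not part of this PR:

def ask(question: str, choices=("y", "n", "u")) -> str:
    """Prompt until the user gives one of the allowed one-letter answers."""
    answer = input(question + " (y/n/u): ")
    while answer not in choices:  # "" is rejected, so Enter alone no longer counts
        answer = input("Invalid input. Please enter y, n, or u: ")
    return answer

ran = ask("Did the generated code run at all?")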

@@ -273,6 +273,7 @@ class Config(str, Enum):
     CLARIFY = "clarify"
     RESPEC = "respec"
     EXECUTE_ONLY = "execute_only"
+    EVALUATE = "evaluate"
     USE_FEEDBACK = "use_feedback"

@@ -323,6 +324,7 @@ STEPS = {
     ],
     Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint, human_review],
     Config.EXECUTE_ONLY: [execute_entrypoint],
+    Config.EVALUATE: [execute_entrypoint, human_review],
 }
 # Future steps that can be added:

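Each Config value names a pipeline: STEPS maps it to an ordered list of step functions that the CLI runs in sequence, so the new "evaluate" config is simply "run the generated code, then immediately collect human feedback". A self-contained sketch of that dispatch, with placeholder step bodies standing in for the real steps (which take AI and database handles):

from enum import Enum

class Config(str, Enum):
    EXECUTE_ONLY = "execute_only"
    EVALUATE = "evaluate"

def execute_entrypoint():
    print("running the generated code")  # placeholder body

def human_review():
    print("collecting y/n/u feedback")  # placeholder body

STEPS = {
    Config.EXECUTE_ONLY: [execute_entrypoint],
    # evaluate = execute, then force a feedback prompt
    Config.EVALUATE: [execute_entrypoint, human_review],
}

for step in STEPS[Config("evaluate")]:
    step()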

@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 [project]
 name = "gpt-engineer"
-version = "0.0.6"
+version = "0.0.7"
 description = "Specify what you want it to build, the AI asks for clarification, and then builds it."
 readme = "README.md"
 requires-python = ">=3"

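The version string in pyproject.toml is what installers and runtime metadata see. Assuming the package is installed from this pyproject.toml, the bump can be verified with the standard library:

from importlib.metadata import version

assert version("gpt-engineer") == "0.0.7"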

@@ -68,7 +68,7 @@ def main(
             "gpt_engineer.main",
             bench_folder,
             "--steps",
-            "execute_only",
+            "evaluate",
         ],