Fix failing tests. Add review step.

This commit is contained in:
Anton Osika
2023-06-25 14:00:06 +02:00
parent 9b86678d61
commit ba33e681df
5 changed files with 203 additions and 96 deletions

View File

@@ -1,73 +1,12 @@
import hashlib import hashlib
import json
import os import os
import random
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List from typing import List
from dataclasses_json import dataclass_json
from gpt_engineer import steps from gpt_engineer import steps
from gpt_engineer.db import DB, DBs from gpt_engineer.db import DBs
from gpt_engineer.steps import Step from gpt_engineer.domain import Step
from gpt_engineer.learning import Learning, extract_learning
@dataclass_json
@dataclass
class Learning:
model: str
temperature: float
steps: str
steps_file_hash: str
prompt: str
logs: str
workspace: str
feedback: str | None
session: str
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
version: str = "0.2"
def steps_file_hash():
with open(steps.__file__, "r") as f:
content = f.read()
return hashlib.sha256(content.encode("utf-8"), usedforsecurity=False).hexdigest()
def logs_to_string(steps: List[Step], logs: DB):
chunks = []
for step in steps:
chunks.append(f"--- {step.__name__} ---\n")
messages = json.loads(logs[step.__name__])
chunks.append(format_messages(messages))
return "\n".join(chunks)
def format_messages(messages: List[dict]) -> str:
return "\n".join(
[f"{message['role']}:\n\n{message['content']}" for message in messages]
)
def extract_learning(
model: str, temperature: float, steps: List[Step], dbs: DBs
) -> Learning:
learning = Learning(
prompt=dbs.input["prompt"],
model=model,
temperature=temperature,
steps=json.dumps([step.__name__ for step in steps]),
steps_file_hash=steps_file_hash(),
feedback=dbs.input.get("feedback"),
session=get_session(),
logs=logs_to_string(steps, dbs.logs),
workspace=dbs.workspace["all_output.txt"],
)
return learning
def send_learning(learning: Learning): def send_learning(learning: Learning):
@@ -83,25 +22,18 @@ def send_learning(learning: Learning):
) )
def get_session():
path = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt"
try:
if path.exists():
user_id = path.read_text()
else:
# random uuid:
user_id = str(random.randint(0, 2**32))
path.write_text(user_id)
return user_id
except IOError:
return "ephemeral_" + str(random.randint(0, 2**32))
def collect_learnings(model: str, temperature: float, steps: List[Step], dbs: DBs): def collect_learnings(model: str, temperature: float, steps: List[Step], dbs: DBs):
if os.environ.get("COLLECT_LEARNINGS_OPT_OUT") in ["true", "1"]: if os.environ.get("COLLECT_LEARNINGS_OPT_OUT") in ["true", "1"]:
print("COLLECT_LEARNINGS_OPT_OUT is set to true, not collecting learning") print("COLLECT_LEARNINGS_OPT_OUT is set to true, not collecting learning")
return return
learnings = extract_learning(model, temperature, steps, dbs) learnings = extract_learning(
model, temperature, steps, dbs, steps_file_hash=steps_file_hash()
)
send_learning(learnings) send_learning(learnings)
def steps_file_hash():
    """Return the SHA-256 hex digest of the steps module's source code.

    Used to version telemetry records: two runs with the same hash executed
    the same step definitions.
    """
    with open(steps.__file__, "r") as source_file:
        digest = hashlib.sha256(
            source_file.read().encode("utf-8"), usedforsecurity=False
        )
    return digest.hexdigest()

6
gpt_engineer/domain.py Normal file
View File

@@ -0,0 +1,6 @@
from typing import Callable, List, TypeVar
from gpt_engineer.ai import AI
from gpt_engineer.db import DBs
# A Step is any callable taking (AI, DBs) and returning the list of chat
# messages it produced; steps are identified elsewhere by their __name__.
Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]])

144
gpt_engineer/learning.py Normal file
View File

@@ -0,0 +1,144 @@
import json
import random
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from dataclasses_json import dataclass_json
from termcolor import colored
from gpt_engineer.db import DB, DBs
from gpt_engineer.domain import Step
@dataclass_json
@dataclass
class Review:
    """A human's answers to the post-run review questions.

    Tri-state fields hold True/False for y/n answers and None when the user
    answered "u" (uncertain) or skipped the question.
    """

    ran: Optional[bool]  # did the generated code run at all?
    perfect: Optional[bool]  # did it do everything the user wanted?
    works: Optional[bool]  # did it do anything useful?
    comments: str  # free-text explanation of what was not working
    raw: str  # raw "ran, perfect, useful" answers, comma-separated
@dataclass_json
@dataclass
class Learning:
    """One serializable telemetry record describing a full gpt-engineer run."""

    model: str
    temperature: float
    steps: str  # JSON-encoded list of executed step names
    steps_file_hash: str  # hash identifying the version of the steps source
    prompt: str
    logs: str  # rendered chat logs for every executed step
    workspace: str  # contents of workspace/all_output.txt
    feedback: Optional[str]
    session: str  # pseudonymous per-machine session id
    review: Optional[Review]  # present only when the human_review step ran
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    version: str = "0.3"  # bump when this record's schema changes
# Colored answer choices appended to each review prompt; the trailing text
# makes the rendered prompt read "y/n/u(ncertain): ".
TERM_CHOICES = (
    colored("y", "green")
    + "/"
    + colored("n", "red")
    + "/"
    + colored("u", "yellow")
    + "(ncertain): "
)
def _validated_answer(prompt: str) -> str:
    """Prompt until the user answers y/n/u or just presses enter (skip)."""
    answer = input(prompt)
    while answer not in ("y", "n", "u", ""):
        answer = input("Invalid input. Please enter y, n, or u: ")
    return answer


def human_input() -> Review:
    """Interactively ask the user to review the run.

    Asks up to three y/n/u questions (the follow-ups only when relevant)
    plus an optional free-text comment, and maps the answers onto a Review
    record (True/False for y/n, None for "u" or enter).

    Returns:
        The populated Review.
    """
    print()
    print(
        colored("To help gpt-engineer learn, please answer 3 questions:", "light_green")
    )
    print()
    ran = _validated_answer("Did the generated code run at all? " + TERM_CHOICES)
    perfect = ""
    useful = ""
    if ran == "y":
        perfect = _validated_answer(
            "Did the generated code do everything you wanted? " + TERM_CHOICES
        )
        if perfect != "y":
            useful = _validated_answer(
                "Did the generated code do anything useful? " + TERM_CHOICES
            )
    comments = ""
    if perfect != "y":
        comments = input(
            "If you have time, please explain what was not working "
            + colored("(ok to leave blank)\n", "light_green")
        )
    print(colored("Thank you", "light_green"))
    # Tri-state mapping: y -> True, n -> False, u or enter -> None.
    to_tristate = {"y": True, "n": False, "u": None, "": None}
    return Review(
        raw=", ".join([ran, perfect, useful]),
        ran=to_tristate[ran],
        works=to_tristate[useful],
        perfect=to_tristate[perfect],
        comments=comments,
    )
def logs_to_string(steps: List[Step], logs: DB):
    """Render the logged chat messages of each executed step as one string.

    Each step contributes a "--- <name> ---" header followed by its
    formatted messages, joined by newlines.
    """

    def render(step) -> str:
        header = f"--- {step.__name__} ---\n"
        body = format_messages(json.loads(logs[step.__name__]))
        return header + "\n" + body

    return "\n".join(render(step) for step in steps)
def format_messages(messages: List[dict]) -> str:
    """Render chat messages as newline-joined "role:\\n\\ncontent" paragraphs."""
    rendered = []
    for message in messages:
        rendered.append(f"{message['role']}:\n\n{message['content']}")
    return "\n".join(rendered)
def extract_learning(
    model: str, temperature: float, steps: List[Step], dbs: DBs, steps_file_hash: str
) -> Learning:
    """Assemble a Learning telemetry record from the run's databases.

    Args:
        model: Name of the LLM used for the run.
        temperature: Sampling temperature the run used.
        steps: The step functions that were executed, in order.
        dbs: The run's databases (input, logs, memory, workspace).
        steps_file_hash: Hash of the steps source file, for versioning.

    Returns:
        A populated Learning record, including the human Review when the
        human_review step stored one in memory.
    """
    review = None
    # Only present when the human_review step ran.
    if "review" in dbs.memory:
        review = Review.from_json(dbs.memory["review"])  # type: ignore
    learning = Learning(
        prompt=dbs.input["prompt"],
        model=model,
        temperature=temperature,
        steps=json.dumps([step.__name__ for step in steps]),
        steps_file_hash=steps_file_hash,
        feedback=dbs.input.get("feedback"),
        session=get_session(),
        logs=logs_to_string(steps, dbs.logs),
        workspace=dbs.workspace["all_output.txt"],
        review=review,
    )
    return learning
def get_session():
    """Return a stable pseudonymous user id, persisted in the temp directory.

    Falls back to a throwaway "ephemeral_"-prefixed id when the id file
    cannot be read or written.
    """
    id_path = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt"
    try:
        if id_path.exists():
            return id_path.read_text()
        # Not an actual UUID: a random 32-bit integer serialized as text.
        fresh_id = str(random.randint(0, 2**32))
        id_path.write_text(fresh_id)
        return fresh_id
    except IOError:
        return "ephemeral_" + str(random.randint(0, 2**32))

View File

@@ -3,13 +3,14 @@ import re
import subprocess import subprocess
from enum import Enum from enum import Enum
from typing import Callable, List, TypeVar from typing import List
from termcolor import colored from termcolor import colored
from gpt_engineer.ai import AI from gpt_engineer.ai import AI
from gpt_engineer.chat_to_files import to_files from gpt_engineer.chat_to_files import to_files
from gpt_engineer.db import DBs from gpt_engineer.db import DBs
from gpt_engineer.learning import human_input
def setup_sys_prompt(dbs: DBs) -> str: def setup_sys_prompt(dbs: DBs) -> str:
@@ -34,8 +35,7 @@ def get_prompt(dbs: DBs) -> str:
return dbs.input["prompt"] return dbs.input["prompt"]
# All steps below have this signature # All steps below have the signature Step
Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]])
def simple_gen(ai: AI, dbs: DBs) -> List[dict]: def simple_gen(ai: AI, dbs: DBs) -> List[dict]:
@@ -54,6 +54,9 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
while True: while True:
messages = ai.next(messages, user_input) messages = ai.next(messages, user_input)
if messages[-1]["content"].strip() == "Nothing more to clarify.":
break
if messages[-1]["content"].strip().lower().startswith("no"): if messages[-1]["content"].strip().lower().startswith("no"):
print("Nothing more to clarify.") print("Nothing more to clarify.")
break break
@@ -77,7 +80,7 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
"Is anything else unclear? If yes, only answer in the form:\n" "Is anything else unclear? If yes, only answer in the form:\n"
"{remaining unclear areas} remaining questions.\n" "{remaining unclear areas} remaining questions.\n"
"{Next question}\n" "{Next question}\n"
'If everything is sufficiently clear, only answer "no".' 'If everything is sufficiently clear, only answer "Nothing more to clarify.".'
) )
print() print()
@@ -183,10 +186,11 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
print("Executing the code...") print("Executing the code...")
print() print()
print( print(
"\033[92m" # green color colored(
+ "Note: If it does not work as expected, consider running the code" "Note: If it does not work as expected, consider running the code"
+ " in another way than above." + " in another way than above.",
+ "\033[0m" "green",
)
) )
print() print()
print("You can press ctrl+c *once* to stop the execution.") print("You can press ctrl+c *once* to stop the execution.")
@@ -196,16 +200,16 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
try: try:
p.wait() p.wait()
except KeyboardInterrupt: except KeyboardInterrupt:
print("Stopping execution...")
print() print()
print("Stopping execution.")
print("Execution stopped.")
p.kill() p.kill()
print() print()
print("Execution stopped.")
return [] return []
def gen_entrypoint(ai, dbs): def gen_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
messages = ai.start( messages = ai.start(
system=( system=(
"You will get information about a codebase that is currently on disk in " "You will get information about a codebase that is currently on disk in "
@@ -254,6 +258,12 @@ def fix_code(ai: AI, dbs: DBs):
return messages return messages
def human_review(ai: AI, dbs: DBs) -> List[dict]:
    """Ask the user to review the run and persist their answers.

    Stores the Review as JSON under memory["review"] for later telemetry
    collection. Produces no chat messages, so it returns an empty list.
    """
    review = human_input()
    dbs.memory["review"] = review.to_json()  # type: ignore
    return []
class Config(str, Enum): class Config(str, Enum):
DEFAULT = "default" DEFAULT = "default"
BENCHMARK = "benchmark" BENCHMARK = "benchmark"
@@ -273,6 +283,7 @@ STEPS = {
gen_clarified_code, gen_clarified_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.BENCHMARK: [simple_gen, gen_entrypoint], Config.BENCHMARK: [simple_gen, gen_entrypoint],
Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint], Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
@@ -282,6 +293,7 @@ STEPS = {
gen_code, gen_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.TDD_PLUS: [ Config.TDD_PLUS: [
gen_spec, gen_spec,
@@ -290,12 +302,14 @@ STEPS = {
fix_code, fix_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.CLARIFY: [ Config.CLARIFY: [
clarify, clarify,
gen_clarified_code, gen_clarified_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.RESPEC: [ Config.RESPEC: [
gen_spec, gen_spec,
@@ -305,11 +319,12 @@ STEPS = {
fix_code, fix_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint], Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint, human_review],
Config.EXECUTE_ONLY: [execute_entrypoint], Config.EXECUTE_ONLY: [execute_entrypoint],
} }
# Future steps that can be added: # Future steps that can be added:
# run_tests_and_fix_files # run_tests_and_fix_files
# execute_entrypoint_and_fix_files_if_needed # execute_entrypoint_and_fix_files_if_it_results_in_error

View File

@@ -2,11 +2,13 @@ import json
import os import os
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pytest import pytest
import rudderstack.analytics as rudder_analytics import rudderstack.analytics as rudder_analytics
from gpt_engineer.collect import collect_learnings, extract_learning from gpt_engineer.collect import collect_learnings, steps_file_hash
from gpt_engineer.db import DB, DBs from gpt_engineer.db import DB, DBs
from gpt_engineer.learning import extract_learning
from gpt_engineer.steps import gen_code from gpt_engineer.steps import gen_code
@@ -28,10 +30,18 @@ def test_collect_learnings(monkeypatch):
collect_learnings(model, temperature, steps, dbs) collect_learnings(model, temperature, steps, dbs)
learnings = extract_learning(model, temperature, steps, dbs) learnings = extract_learning(
model, temperature, steps, dbs, steps_file_hash=steps_file_hash()
)
assert rudder_analytics.track.call_count == 1 assert rudder_analytics.track.call_count == 1
assert rudder_analytics.track.call_args[1]["event"] == "learning" assert rudder_analytics.track.call_args[1]["event"] == "learning"
assert rudder_analytics.track.call_args[1]["properties"] == learnings.to_dict() a = {
k: v
for k, v in rudder_analytics.track.call_args[1]["properties"].items()
if k != "timestamp"
}
b = {k: v for k, v in learnings.to_dict().items() if k != "timestamp"}
assert a == b
assert code in learnings.logs assert code in learnings.logs
assert code in learnings.workspace assert code in learnings.workspace