mirror of
https://github.com/aljazceru/gpt-engineer.git
synced 2025-12-17 12:45:26 +01:00
Fix failing tests. Add review step.
This commit is contained in:
@@ -1,73 +1,12 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from datetime import datetime
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from dataclasses_json import dataclass_json
|
|
||||||
|
|
||||||
from gpt_engineer import steps
|
from gpt_engineer import steps
|
||||||
from gpt_engineer.db import DB, DBs
|
from gpt_engineer.db import DBs
|
||||||
from gpt_engineer.steps import Step
|
from gpt_engineer.domain import Step
|
||||||
|
from gpt_engineer.learning import Learning, extract_learning
|
||||||
|
|
||||||
@dataclass_json
|
|
||||||
@dataclass
|
|
||||||
class Learning:
|
|
||||||
model: str
|
|
||||||
temperature: float
|
|
||||||
steps: str
|
|
||||||
steps_file_hash: str
|
|
||||||
prompt: str
|
|
||||||
logs: str
|
|
||||||
workspace: str
|
|
||||||
feedback: str | None
|
|
||||||
session: str
|
|
||||||
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
|
|
||||||
version: str = "0.2"
|
|
||||||
|
|
||||||
|
|
||||||
def steps_file_hash():
|
|
||||||
with open(steps.__file__, "r") as f:
|
|
||||||
content = f.read()
|
|
||||||
return hashlib.sha256(content.encode("utf-8"), usedforsecurity=False).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def logs_to_string(steps: List[Step], logs: DB):
|
|
||||||
chunks = []
|
|
||||||
for step in steps:
|
|
||||||
chunks.append(f"--- {step.__name__} ---\n")
|
|
||||||
messages = json.loads(logs[step.__name__])
|
|
||||||
chunks.append(format_messages(messages))
|
|
||||||
return "\n".join(chunks)
|
|
||||||
|
|
||||||
|
|
||||||
def format_messages(messages: List[dict]) -> str:
|
|
||||||
return "\n".join(
|
|
||||||
[f"{message['role']}:\n\n{message['content']}" for message in messages]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_learning(
|
|
||||||
model: str, temperature: float, steps: List[Step], dbs: DBs
|
|
||||||
) -> Learning:
|
|
||||||
learning = Learning(
|
|
||||||
prompt=dbs.input["prompt"],
|
|
||||||
model=model,
|
|
||||||
temperature=temperature,
|
|
||||||
steps=json.dumps([step.__name__ for step in steps]),
|
|
||||||
steps_file_hash=steps_file_hash(),
|
|
||||||
feedback=dbs.input.get("feedback"),
|
|
||||||
session=get_session(),
|
|
||||||
logs=logs_to_string(steps, dbs.logs),
|
|
||||||
workspace=dbs.workspace["all_output.txt"],
|
|
||||||
)
|
|
||||||
return learning
|
|
||||||
|
|
||||||
|
|
||||||
def send_learning(learning: Learning):
|
def send_learning(learning: Learning):
|
||||||
@@ -83,25 +22,18 @@ def send_learning(learning: Learning):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_session():
|
|
||||||
path = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt"
|
|
||||||
|
|
||||||
try:
|
|
||||||
if path.exists():
|
|
||||||
user_id = path.read_text()
|
|
||||||
else:
|
|
||||||
# random uuid:
|
|
||||||
user_id = str(random.randint(0, 2**32))
|
|
||||||
path.write_text(user_id)
|
|
||||||
return user_id
|
|
||||||
except IOError:
|
|
||||||
return "ephemeral_" + str(random.randint(0, 2**32))
|
|
||||||
|
|
||||||
|
|
||||||
def collect_learnings(model: str, temperature: float, steps: List[Step], dbs: DBs):
    """Gather telemetry about this run and ship it, unless the user opted out.

    Honors the COLLECT_LEARNINGS_OPT_OUT environment variable ("true" or
    "1" disables collection). Otherwise builds a Learning record from the
    executed steps and databases and forwards it via send_learning.
    """
    opted_out = os.environ.get("COLLECT_LEARNINGS_OPT_OUT") in ["true", "1"]
    if opted_out:
        print("COLLECT_LEARNINGS_OPT_OUT is set to true, not collecting learning")
        return

    record = extract_learning(
        model, temperature, steps, dbs, steps_file_hash=steps_file_hash()
    )
    send_learning(record)
|
||||||
|
|
||||||
|
|
||||||
|
def steps_file_hash():
    """Return the SHA-256 hex digest of the current steps.py source file.

    Stored with each Learning record so a collected run can be tied to the
    exact step definitions that produced it.
    """
    with open(steps.__file__, "r") as source:
        # usedforsecurity=False: this is a change-detection fingerprint,
        # not a security-sensitive hash.
        digest = hashlib.sha256(source.read().encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()
|
||||||
|
|||||||
6
gpt_engineer/domain.py
Normal file
6
gpt_engineer/domain.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from typing import Callable, List, TypeVar
|
||||||
|
|
||||||
|
from gpt_engineer.ai import AI
|
||||||
|
from gpt_engineer.db import DBs
|
||||||
|
|
||||||
|
# A pipeline step: any callable taking (AI, DBs) and returning the list of
# chat messages (role/content dicts) it produced.
Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]])
|
||||||
144
gpt_engineer/learning.py
Normal file
144
gpt_engineer/learning.py
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
import json
|
||||||
|
import random
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from dataclasses_json import dataclass_json
|
||||||
|
from termcolor import colored
|
||||||
|
|
||||||
|
from gpt_engineer.db import DB, DBs
|
||||||
|
from gpt_engineer.domain import Step
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass_json
@dataclass
class Review:
    """A human's verdict on one generated codebase (collected by human_input).

    The tri-state fields hold True/False/None, where None means the user
    was uncertain or the question was not asked/answered.
    """

    ran: Optional[bool]  # did the generated code run at all?
    perfect: Optional[bool]  # did it do everything the user wanted?
    works: Optional[bool]  # did it do anything useful?
    comments: str  # free-text explanation (may be empty)
    raw: str  # the raw y/n/u answers, comma-separated
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass_json
@dataclass
class Learning:
    """Telemetry record for one gpt-engineer run, serialized for collection.

    Bundles the run configuration, prompt, per-step chat logs, the produced
    workspace output and the optional human Review of the result.
    """

    model: str
    temperature: float
    steps: str  # JSON-encoded list of executed step names
    steps_file_hash: str  # sha256 of steps.py, ties the run to exact step code
    prompt: str
    logs: str
    workspace: str  # contents of workspace/all_output.txt
    feedback: Optional[str]
    session: str  # pseudonymous per-machine id from get_session()
    review: Optional[Review]  # present only when the human_review step ran
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    version: str = "0.2"  # NOTE(review): learning.py hunk shows "0.3" — confirm which is current
|
||||||
|
|
||||||
|
|
||||||
|
# Colored "y/n/u(ncertain): " answer hint appended to each review question.
TERM_CHOICES = (
    colored("y", "green")
    + "/"
    + colored("n", "red")
    + "/"
    + colored("u", "yellow")
    + "(ncertain): "
)
|
||||||
|
|
||||||
|
|
||||||
|
def human_input() -> Review:
    """Interactively ask the user three questions about the generated code.

    Answers are y/n/u; an empty answer is treated like "uncertain".
    Follow-ups are only asked when they make sense: "did it do everything?"
    only if it ran, "was it useful?" only if it was not perfect.

    Returns:
        A Review with the tri-state answers, the free-text comment and the
        raw answer string.
    """
    print()
    print(
        colored("To help gpt-engineer learn, please answer 3 questions:", "light_green")
    )
    print()

    ran = input("Did the generated code run at all? " + TERM_CHOICES)
    while ran not in ("y", "n", "u", ""):
        ran = input("Invalid input. Please enter y, n, or u: ")

    # Defaults for answers whose question is skipped.
    perfect = ""
    useful = ""

    if ran == "y":
        perfect = input(
            "Did the generated code do everything you wanted? " + TERM_CHOICES
        )
        while perfect not in ("y", "n", "u", ""):
            perfect = input("Invalid input. Please enter y, n, or u: ")

        if perfect != "y":
            useful = input("Did the generated code do anything useful? " + TERM_CHOICES)
            while useful not in ("y", "n", "u", ""):
                useful = input("Invalid input. Please enter y, n, or u: ")

    comments = ""
    if perfect != "y":
        comments = input(
            "If you have time, please explain what was not working "
            + colored("(ok to leave blank)\n", "light_green")
        )
    print(colored("Thank you", "light_green"))
    return Review(
        raw=", ".join([ran, perfect, useful]),
        # Map y/n/u/"" onto the tri-state True/False/None.
        ran={"y": True, "n": False, "u": None, "": None}[ran],
        works={"y": True, "n": False, "u": None, "": None}[useful],
        perfect={"y": True, "n": False, "u": None, "": None}[perfect],
        comments=comments,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def logs_to_string(steps: List[Step], logs: DB):
    """Concatenate the logged chat of every executed step into one string.

    Each step contributes a "--- <step name> ---" header followed by its
    formatted messages, loaded from the logs DB keyed by step name.
    """
    parts = []
    for step in steps:
        parts.append(f"--- {step.__name__} ---\n")
        parts.append(format_messages(json.loads(logs[step.__name__])))
    return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def format_messages(messages: List[dict]) -> str:
    """Render chat messages as "role:\\n\\ncontent" paragraphs joined by newlines."""
    rendered = (f"{message['role']}:\n\n{message['content']}" for message in messages)
    return "\n".join(rendered)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_learning(
    model: str, temperature: float, steps: List[Step], dbs: DBs, steps_file_hash
) -> Learning:
    """Assemble a Learning record from the run's inputs, logs and workspace.

    The human review is included when the human_review step stored one in
    the memory DB; otherwise it is None.
    """
    stored_review = dbs.memory["review"] if "review" in dbs.memory else None
    review = (
        Review.from_json(stored_review)  # type: ignore
        if stored_review is not None
        else None
    )
    # Keyword order below is preserved deliberately: keyword arguments are
    # evaluated left-to-right, and some (get_session, DB lookups) have
    # side effects or can raise.
    return Learning(
        prompt=dbs.input["prompt"],
        model=model,
        temperature=temperature,
        steps=json.dumps([step.__name__ for step in steps]),
        steps_file_hash=steps_file_hash,
        feedback=dbs.input.get("feedback"),
        session=get_session(),
        logs=logs_to_string(steps, dbs.logs),
        workspace=dbs.workspace["all_output.txt"],
        review=review,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_session():
    """Return a stable pseudonymous session id for this machine.

    The id (a random integer rendered as a string, not an actual uuid) is
    cached in a file in the system temp directory so repeated runs report
    the same session. If the temp file cannot be read or written, a fresh
    "ephemeral_" id is returned instead.
    """
    id_file = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt"

    try:
        if id_file.exists():
            return id_file.read_text()
        user_id = str(random.randint(0, 2**32))
        id_file.write_text(user_id)
        return user_id
    except IOError:
        return "ephemeral_" + str(random.randint(0, 2**32))
|
||||||
@@ -3,13 +3,14 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Callable, List, TypeVar
|
from typing import List
|
||||||
|
|
||||||
from termcolor import colored
|
from termcolor import colored
|
||||||
|
|
||||||
from gpt_engineer.ai import AI
|
from gpt_engineer.ai import AI
|
||||||
from gpt_engineer.chat_to_files import to_files
|
from gpt_engineer.chat_to_files import to_files
|
||||||
from gpt_engineer.db import DBs
|
from gpt_engineer.db import DBs
|
||||||
|
from gpt_engineer.learning import human_input
|
||||||
|
|
||||||
|
|
||||||
def setup_sys_prompt(dbs: DBs) -> str:
|
def setup_sys_prompt(dbs: DBs) -> str:
|
||||||
@@ -34,8 +35,7 @@ def get_prompt(dbs: DBs) -> str:
|
|||||||
return dbs.input["prompt"]
|
return dbs.input["prompt"]
|
||||||
|
|
||||||
|
|
||||||
# All steps below have this signature
|
# All steps below have the signature Step
|
||||||
Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]])
|
|
||||||
|
|
||||||
|
|
||||||
def simple_gen(ai: AI, dbs: DBs) -> List[dict]:
|
def simple_gen(ai: AI, dbs: DBs) -> List[dict]:
|
||||||
@@ -54,6 +54,9 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
|
|||||||
while True:
|
while True:
|
||||||
messages = ai.next(messages, user_input)
|
messages = ai.next(messages, user_input)
|
||||||
|
|
||||||
|
if messages[-1]["content"].strip() == "Nothing more to clarify.":
|
||||||
|
break
|
||||||
|
|
||||||
if messages[-1]["content"].strip().lower().startswith("no"):
|
if messages[-1]["content"].strip().lower().startswith("no"):
|
||||||
print("Nothing more to clarify.")
|
print("Nothing more to clarify.")
|
||||||
break
|
break
|
||||||
@@ -77,7 +80,7 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
|
|||||||
"Is anything else unclear? If yes, only answer in the form:\n"
|
"Is anything else unclear? If yes, only answer in the form:\n"
|
||||||
"{remaining unclear areas} remaining questions.\n"
|
"{remaining unclear areas} remaining questions.\n"
|
||||||
"{Next question}\n"
|
"{Next question}\n"
|
||||||
'If everything is sufficiently clear, only answer "no".'
|
'If everything is sufficiently clear, only answer "Nothing more to clarify.".'
|
||||||
)
|
)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
@@ -183,10 +186,11 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
|
|||||||
print("Executing the code...")
|
print("Executing the code...")
|
||||||
print()
|
print()
|
||||||
print(
|
print(
|
||||||
"\033[92m" # green color
|
colored(
|
||||||
+ "Note: If it does not work as expected, consider running the code"
|
"Note: If it does not work as expected, consider running the code"
|
||||||
+ " in another way than above."
|
+ " in another way than above.",
|
||||||
+ "\033[0m"
|
"green",
|
||||||
|
)
|
||||||
)
|
)
|
||||||
print()
|
print()
|
||||||
print("You can press ctrl+c *once* to stop the execution.")
|
print("You can press ctrl+c *once* to stop the execution.")
|
||||||
@@ -196,16 +200,16 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
|
|||||||
try:
|
try:
|
||||||
p.wait()
|
p.wait()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print("Stopping execution...")
|
|
||||||
print()
|
print()
|
||||||
|
print("Stopping execution.")
|
||||||
|
print("Execution stopped.")
|
||||||
p.kill()
|
p.kill()
|
||||||
print()
|
print()
|
||||||
print("Execution stopped.")
|
|
||||||
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
def gen_entrypoint(ai, dbs):
|
def gen_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
|
||||||
messages = ai.start(
|
messages = ai.start(
|
||||||
system=(
|
system=(
|
||||||
"You will get information about a codebase that is currently on disk in "
|
"You will get information about a codebase that is currently on disk in "
|
||||||
@@ -254,6 +258,12 @@ def fix_code(ai: AI, dbs: DBs):
|
|||||||
return messages
|
return messages
|
||||||
|
|
||||||
|
|
||||||
|
def human_review(ai: AI, dbs: DBs):
    """Step: ask the user to review the run and stash the answers in memory.

    The review is stored as JSON under the "review" key, where
    extract_learning later picks it up. Returns no chat messages since this
    step involves no AI interaction.
    """
    review = human_input()
    dbs.memory["review"] = review.to_json()  # type: ignore
    return []
|
||||||
|
|
||||||
|
|
||||||
class Config(str, Enum):
|
class Config(str, Enum):
|
||||||
DEFAULT = "default"
|
DEFAULT = "default"
|
||||||
BENCHMARK = "benchmark"
|
BENCHMARK = "benchmark"
|
||||||
@@ -273,6 +283,7 @@ STEPS = {
|
|||||||
gen_clarified_code,
|
gen_clarified_code,
|
||||||
gen_entrypoint,
|
gen_entrypoint,
|
||||||
execute_entrypoint,
|
execute_entrypoint,
|
||||||
|
human_review,
|
||||||
],
|
],
|
||||||
Config.BENCHMARK: [simple_gen, gen_entrypoint],
|
Config.BENCHMARK: [simple_gen, gen_entrypoint],
|
||||||
Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
|
Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
|
||||||
@@ -282,6 +293,7 @@ STEPS = {
|
|||||||
gen_code,
|
gen_code,
|
||||||
gen_entrypoint,
|
gen_entrypoint,
|
||||||
execute_entrypoint,
|
execute_entrypoint,
|
||||||
|
human_review,
|
||||||
],
|
],
|
||||||
Config.TDD_PLUS: [
|
Config.TDD_PLUS: [
|
||||||
gen_spec,
|
gen_spec,
|
||||||
@@ -290,12 +302,14 @@ STEPS = {
|
|||||||
fix_code,
|
fix_code,
|
||||||
gen_entrypoint,
|
gen_entrypoint,
|
||||||
execute_entrypoint,
|
execute_entrypoint,
|
||||||
|
human_review,
|
||||||
],
|
],
|
||||||
Config.CLARIFY: [
|
Config.CLARIFY: [
|
||||||
clarify,
|
clarify,
|
||||||
gen_clarified_code,
|
gen_clarified_code,
|
||||||
gen_entrypoint,
|
gen_entrypoint,
|
||||||
execute_entrypoint,
|
execute_entrypoint,
|
||||||
|
human_review,
|
||||||
],
|
],
|
||||||
Config.RESPEC: [
|
Config.RESPEC: [
|
||||||
gen_spec,
|
gen_spec,
|
||||||
@@ -305,11 +319,12 @@ STEPS = {
|
|||||||
fix_code,
|
fix_code,
|
||||||
gen_entrypoint,
|
gen_entrypoint,
|
||||||
execute_entrypoint,
|
execute_entrypoint,
|
||||||
|
human_review,
|
||||||
],
|
],
|
||||||
Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint],
|
Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint, human_review],
|
||||||
Config.EXECUTE_ONLY: [execute_entrypoint],
|
Config.EXECUTE_ONLY: [execute_entrypoint],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Future steps that can be added:
|
# Future steps that can be added:
|
||||||
# run_tests_and_fix_files
|
# run_tests_and_fix_files
|
||||||
# execute_entrypoint_and_fix_files_if_needed
|
# execute_entrypoint_and_fix_files_if_it_results_in_error
|
||||||
|
|||||||
@@ -2,11 +2,13 @@ import json
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import rudderstack.analytics as rudder_analytics
|
import rudderstack.analytics as rudder_analytics
|
||||||
|
|
||||||
from gpt_engineer.collect import collect_learnings, extract_learning
|
from gpt_engineer.collect import collect_learnings, steps_file_hash
|
||||||
from gpt_engineer.db import DB, DBs
|
from gpt_engineer.db import DB, DBs
|
||||||
|
from gpt_engineer.learning import extract_learning
|
||||||
from gpt_engineer.steps import gen_code
|
from gpt_engineer.steps import gen_code
|
||||||
|
|
||||||
|
|
||||||
@@ -28,10 +30,18 @@ def test_collect_learnings(monkeypatch):
|
|||||||
|
|
||||||
collect_learnings(model, temperature, steps, dbs)
|
collect_learnings(model, temperature, steps, dbs)
|
||||||
|
|
||||||
learnings = extract_learning(model, temperature, steps, dbs)
|
learnings = extract_learning(
|
||||||
|
model, temperature, steps, dbs, steps_file_hash=steps_file_hash()
|
||||||
|
)
|
||||||
assert rudder_analytics.track.call_count == 1
|
assert rudder_analytics.track.call_count == 1
|
||||||
assert rudder_analytics.track.call_args[1]["event"] == "learning"
|
assert rudder_analytics.track.call_args[1]["event"] == "learning"
|
||||||
assert rudder_analytics.track.call_args[1]["properties"] == learnings.to_dict()
|
a = {
|
||||||
|
k: v
|
||||||
|
for k, v in rudder_analytics.track.call_args[1]["properties"].items()
|
||||||
|
if k != "timestamp"
|
||||||
|
}
|
||||||
|
b = {k: v for k, v in learnings.to_dict().items() if k != "timestamp"}
|
||||||
|
assert a == b
|
||||||
|
|
||||||
assert code in learnings.logs
|
assert code in learnings.logs
|
||||||
assert code in learnings.workspace
|
assert code in learnings.workspace
|
||||||
|
|||||||
Reference in New Issue
Block a user