Fix failing tests. Add review step.

This commit is contained in:
Anton Osika
2023-06-25 14:00:06 +02:00
parent 9b86678d61
commit ba33e681df
5 changed files with 203 additions and 96 deletions

View File

@@ -1,73 +1,12 @@
import hashlib
import json
import os
import random
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List
from dataclasses_json import dataclass_json
from gpt_engineer import steps
from gpt_engineer.db import DB, DBs
from gpt_engineer.steps import Step
@dataclass_json
@dataclass
class Learning:
    """Telemetry record of one gpt-engineer run, serialized for collection (schema v0.2)."""

    model: str  # name of the LLM used for the run
    temperature: float  # sampling temperature used
    steps: str  # JSON-encoded list of executed step names
    steps_file_hash: str  # hash of the steps module source, to detect step/prompt drift
    prompt: str  # the user's input prompt (dbs.input["prompt"])
    logs: str  # concatenated chat logs of all steps
    workspace: str  # final generated output (workspace "all_output.txt")
    feedback: str | None  # optional free-text user feedback, if provided
    session: str  # per-user session id (see get_session)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    version: str = "0.2"  # schema version of this record
def steps_file_hash():
    """Return the SHA-256 hex digest of the gpt_engineer.steps source file.

    Used to tell which version of the step prompts produced a Learning record.
    """
    source = Path(steps.__file__).read_text()
    # usedforsecurity=False: this is a fingerprint, not a cryptographic use.
    return hashlib.sha256(source.encode("utf-8"), usedforsecurity=False).hexdigest()
def logs_to_string(steps: List[Step], logs: DB):
    """Render each step's logged chat messages under a '--- <step name> ---' header."""
    sections = []
    for step in steps:
        sections.append(f"--- {step.__name__} ---\n")
        sections.append(format_messages(json.loads(logs[step.__name__])))
    return "\n".join(sections)
def format_messages(messages: List[dict]) -> str:
    """Render each message as its role, a blank line, then its content; join with newlines."""
    rendered = (f"{m['role']}:\n\n{m['content']}" for m in messages)
    return "\n".join(rendered)
def extract_learning(
    model: str, temperature: float, steps: List[Step], dbs: DBs
) -> Learning:
    """Assemble a Learning record from the run's databases and metadata.

    Pulls the prompt and optional feedback from dbs.input, the chat logs from
    dbs.logs, and the generated output from dbs.workspace.
    """
    return Learning(
        prompt=dbs.input["prompt"],
        model=model,
        temperature=temperature,
        steps=json.dumps([step.__name__ for step in steps]),
        steps_file_hash=steps_file_hash(),
        feedback=dbs.input.get("feedback"),
        session=get_session(),
        logs=logs_to_string(steps, dbs.logs),
        workspace=dbs.workspace["all_output.txt"],
    )
from gpt_engineer.db import DBs
from gpt_engineer.domain import Step
from gpt_engineer.learning import Learning, extract_learning
def send_learning(learning: Learning):
@@ -83,25 +22,18 @@ def send_learning(learning: Learning):
)
def get_session():
    """Return a persistent per-machine user id, or an ephemeral one on I/O failure.

    The id is cached in a file in the system temp directory; the first call
    creates it with a random 32-bit integer (not an actual uuid).
    """
    id_file = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt"
    try:
        if id_file.exists():
            return id_file.read_text()
        user_id = str(random.randint(0, 2**32))
        id_file.write_text(user_id)
        return user_id
    except IOError:
        # Temp dir unusable: fall back to a one-off id for this process.
        return "ephemeral_" + str(random.randint(0, 2**32))
def collect_learnings(model: str, temperature: float, steps: List[Step], dbs: DBs):
if os.environ.get("COLLECT_LEARNINGS_OPT_OUT") in ["true", "1"]:
print("COLLECT_LEARNINGS_OPT_OUT is set to true, not collecting learning")
return
learnings = extract_learning(model, temperature, steps, dbs)
learnings = extract_learning(
model, temperature, steps, dbs, steps_file_hash=steps_file_hash()
)
send_learning(learnings)
def steps_file_hash():
    """Return the SHA-256 hex digest of the gpt_engineer.steps source file."""
    with open(steps.__file__, "r") as source:
        # usedforsecurity=False: this fingerprints the steps file, not a crypto use.
        digest = hashlib.sha256(source.read().encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()

6
gpt_engineer/domain.py Normal file
View File

@@ -0,0 +1,6 @@
from typing import Callable, List, TypeVar
from gpt_engineer.ai import AI
from gpt_engineer.db import DBs
# A "step" is any callable taking (AI, DBs) and returning the list of chat
# messages it produced.
# NOTE(review): a TypeVar is used where a plain alias
# (Step = Callable[[AI, DBs], List[dict]]) may have been intended — confirm.
Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]])

144
gpt_engineer/learning.py Normal file
View File

@@ -0,0 +1,144 @@
import json
import random
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from dataclasses_json import dataclass_json
from termcolor import colored
from gpt_engineer.db import DB, DBs
from gpt_engineer.domain import Step
@dataclass_json
@dataclass
class Review:
    """A human's answers to the post-run review questions (see human_input)."""

    ran: Optional[bool]  # did the generated code run at all (None = uncertain / skipped)
    perfect: Optional[bool]  # did it do everything the user wanted
    works: Optional[bool]  # did it do anything useful
    comments: str  # free-text explanation of what was not working (may be empty)
    raw: str  # the raw "ran, perfect, useful" answers as entered
@dataclass_json
@dataclass
class Learning:
    """Telemetry record of one gpt-engineer run, serialized for collection (schema v0.3)."""

    model: str  # name of the LLM used for the run
    temperature: float  # sampling temperature used
    steps: str  # JSON-encoded list of executed step names
    steps_file_hash: str  # hash of the steps module source, to detect step/prompt drift
    prompt: str  # the user's input prompt (dbs.input["prompt"])
    logs: str  # concatenated chat logs of all steps
    workspace: str  # final generated output (workspace "all_output.txt")
    feedback: Optional[str]  # optional free-text user feedback, if provided
    session: str  # per-user session id (see get_session)
    review: Optional[Review]  # human review answers, if the human_review step ran
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    version: str = "0.3"  # schema version of this record
# Answer hint appended to each review question; renders as "y/n/u(ncertain): "
# with y green, n red and u yellow.
TERM_CHOICES = (
    colored("y", "green")
    + "/"
    + colored("n", "red")
    + "/"
    + colored("u", "yellow")
    + "(ncertain): "
)
def human_input() -> Review:
    """Interactively ask the user up to three yes/no/uncertain questions about
    the generated code and return the answers as a Review.

    Follow-up questions are only asked when relevant: "perfect" only if the
    code ran, "useful" only if it was not perfect. Empty answers are allowed
    and map to None.
    """

    def ask(question: str) -> str:
        # Prompt until the answer is y / n / u or blank.
        answer = input(question + TERM_CHOICES)
        while answer not in ("y", "n", "u", ""):
            answer = input("Invalid input. Please enter y, n, or u: ")
        return answer

    print()
    print(
        colored("To help gpt-engineer learn, please answer 3 questions:", "light_green")
    )
    print()
    ran = ask("Did the generated code run at all? ")
    perfect = ""
    useful = ""
    if ran == "y":
        perfect = ask("Did the generated code do everything you wanted? ")
        if perfect != "y":
            useful = ask("Did the generated code do anything useful? ")
    comments = ""
    if perfect != "y":
        comments = input(
            "If you have time, please explain what was not working "
            + colored("(ok to leave blank)\n", "light_green")
        )
    print(colored("Thank you", "light_green"))
    to_bool = {"y": True, "n": False, "u": None, "": None}
    return Review(
        raw=", ".join([ran, perfect, useful]),
        ran=to_bool[ran],
        works=to_bool[useful],
        perfect=to_bool[perfect],
        comments=comments,
    )
def logs_to_string(steps: List[Step], logs: DB):
    """Render each executed step's chat log under a '--- <step name> ---' header."""
    parts = []
    for step in steps:
        name = step.__name__
        parts.append(f"--- {name} ---\n")
        parts.append(format_messages(json.loads(logs[name])))
    return "\n".join(parts)
def format_messages(messages: List[dict]) -> str:
    """Render each message as its role, a blank line, then its content; join with newlines."""
    lines = []
    for message in messages:
        lines.append(f"{message['role']}:\n\n{message['content']}")
    return "\n".join(lines)
def extract_learning(
    model: str, temperature: float, steps: List[Step], dbs: DBs, steps_file_hash
) -> Learning:
    """Assemble a Learning record from the run's databases and metadata.

    Pulls the prompt and optional feedback from dbs.input, the chat logs from
    dbs.logs, the generated output from dbs.workspace, and — when the
    human_review step ran — the stored review from dbs.memory.
    """
    review = (
        Review.from_json(dbs.memory["review"])  # type: ignore
        if "review" in dbs.memory
        else None
    )
    return Learning(
        prompt=dbs.input["prompt"],
        model=model,
        temperature=temperature,
        steps=json.dumps([step.__name__ for step in steps]),
        steps_file_hash=steps_file_hash,
        feedback=dbs.input.get("feedback"),
        session=get_session(),
        logs=logs_to_string(steps, dbs.logs),
        workspace=dbs.workspace["all_output.txt"],
        review=review,
    )
def get_session():
    """Return a persistent per-machine user id, or an ephemeral one on I/O failure.

    The id is cached in a file in the system temp directory and created on
    first use from a random 32-bit integer (not an actual uuid).
    """
    path = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt"
    try:
        if not path.exists():
            path.write_text(str(random.randint(0, 2**32)))
        return path.read_text()
    except IOError:
        # Temp dir unusable: fall back to a one-off id for this process.
        return "ephemeral_" + str(random.randint(0, 2**32))

View File

@@ -3,13 +3,14 @@ import re
import subprocess
from enum import Enum
from typing import Callable, List, TypeVar
from typing import List
from termcolor import colored
from gpt_engineer.ai import AI
from gpt_engineer.chat_to_files import to_files
from gpt_engineer.db import DBs
from gpt_engineer.learning import human_input
def setup_sys_prompt(dbs: DBs) -> str:
@@ -34,8 +35,7 @@ def get_prompt(dbs: DBs) -> str:
return dbs.input["prompt"]
# All steps below have this signature
Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]])
# All steps below have the signature Step
def simple_gen(ai: AI, dbs: DBs) -> List[dict]:
@@ -54,6 +54,9 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
while True:
messages = ai.next(messages, user_input)
if messages[-1]["content"].strip() == "Nothing more to clarify.":
break
if messages[-1]["content"].strip().lower().startswith("no"):
print("Nothing more to clarify.")
break
@@ -77,7 +80,7 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
"Is anything else unclear? If yes, only answer in the form:\n"
"{remaining unclear areas} remaining questions.\n"
"{Next question}\n"
'If everything is sufficiently clear, only answer "no".'
'If everything is sufficiently clear, only answer "Nothing more to clarify.".'
)
print()
@@ -183,10 +186,11 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
print("Executing the code...")
print()
print(
"\033[92m" # green color
+ "Note: If it does not work as expected, consider running the code"
+ " in another way than above."
+ "\033[0m"
colored(
"Note: If it does not work as expected, consider running the code"
+ " in another way than above.",
"green",
)
)
print()
print("You can press ctrl+c *once* to stop the execution.")
@@ -196,16 +200,16 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
try:
p.wait()
except KeyboardInterrupt:
print("Stopping execution...")
print()
print("Stopping execution.")
print("Execution stopped.")
p.kill()
print()
print("Execution stopped.")
return []
def gen_entrypoint(ai, dbs):
def gen_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
messages = ai.start(
system=(
"You will get information about a codebase that is currently on disk in "
@@ -254,6 +258,12 @@ def fix_code(ai: AI, dbs: DBs):
return messages
def human_review(ai: AI, dbs: DBs):
    """Ask the user to review the run and persist their answers in dbs.memory.

    Returns no messages, like other non-LLM steps.
    """
    dbs.memory["review"] = human_input().to_json()  # type: ignore
    return []
class Config(str, Enum):
DEFAULT = "default"
BENCHMARK = "benchmark"
@@ -273,6 +283,7 @@ STEPS = {
gen_clarified_code,
gen_entrypoint,
execute_entrypoint,
human_review,
],
Config.BENCHMARK: [simple_gen, gen_entrypoint],
Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
@@ -282,6 +293,7 @@ STEPS = {
gen_code,
gen_entrypoint,
execute_entrypoint,
human_review,
],
Config.TDD_PLUS: [
gen_spec,
@@ -290,12 +302,14 @@ STEPS = {
fix_code,
gen_entrypoint,
execute_entrypoint,
human_review,
],
Config.CLARIFY: [
clarify,
gen_clarified_code,
gen_entrypoint,
execute_entrypoint,
human_review,
],
Config.RESPEC: [
gen_spec,
@@ -305,11 +319,12 @@ STEPS = {
fix_code,
gen_entrypoint,
execute_entrypoint,
human_review,
],
Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint],
Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint, human_review],
Config.EXECUTE_ONLY: [execute_entrypoint],
}
# Future steps that can be added:
# run_tests_and_fix_files
# execute_entrypoint_and_fix_files_if_needed
# execute_entrypoint_and_fix_files_if_it_results_in_error

View File

@@ -2,11 +2,13 @@ import json
import os
from unittest.mock import MagicMock
import pytest
import rudderstack.analytics as rudder_analytics
from gpt_engineer.collect import collect_learnings, extract_learning
from gpt_engineer.collect import collect_learnings, steps_file_hash
from gpt_engineer.db import DB, DBs
from gpt_engineer.learning import extract_learning
from gpt_engineer.steps import gen_code
@@ -28,10 +30,18 @@ def test_collect_learnings(monkeypatch):
collect_learnings(model, temperature, steps, dbs)
learnings = extract_learning(model, temperature, steps, dbs)
learnings = extract_learning(
model, temperature, steps, dbs, steps_file_hash=steps_file_hash()
)
assert rudder_analytics.track.call_count == 1
assert rudder_analytics.track.call_args[1]["event"] == "learning"
assert rudder_analytics.track.call_args[1]["properties"] == learnings.to_dict()
a = {
k: v
for k, v in rudder_analytics.track.call_args[1]["properties"].items()
if k != "timestamp"
}
b = {k: v for k, v in learnings.to_dict().items() if k != "timestamp"}
assert a == b
assert code in learnings.logs
assert code in learnings.workspace