Fix failing tests. Add review step.

This commit is contained in:
Anton Osika
2023-06-25 14:00:06 +02:00
parent 9b86678d61
commit ba33e681df
5 changed files with 203 additions and 96 deletions

View File

@@ -1,73 +1,12 @@
import hashlib import hashlib
import json
import os import os
import random
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List from typing import List
from dataclasses_json import dataclass_json
from gpt_engineer import steps from gpt_engineer import steps
from gpt_engineer.db import DB, DBs from gpt_engineer.db import DBs
from gpt_engineer.steps import Step from gpt_engineer.domain import Step
from gpt_engineer.learning import Learning, extract_learning
@dataclass_json
@dataclass
class Learning:
model: str
temperature: float
steps: str
steps_file_hash: str
prompt: str
logs: str
workspace: str
feedback: str | None
session: str
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
version: str = "0.2"
def steps_file_hash():
with open(steps.__file__, "r") as f:
content = f.read()
return hashlib.sha256(content.encode("utf-8"), usedforsecurity=False).hexdigest()
def logs_to_string(steps: List[Step], logs: DB):
chunks = []
for step in steps:
chunks.append(f"--- {step.__name__} ---\n")
messages = json.loads(logs[step.__name__])
chunks.append(format_messages(messages))
return "\n".join(chunks)
def format_messages(messages: List[dict]) -> str:
return "\n".join(
[f"{message['role']}:\n\n{message['content']}" for message in messages]
)
def extract_learning(
model: str, temperature: float, steps: List[Step], dbs: DBs
) -> Learning:
learning = Learning(
prompt=dbs.input["prompt"],
model=model,
temperature=temperature,
steps=json.dumps([step.__name__ for step in steps]),
steps_file_hash=steps_file_hash(),
feedback=dbs.input.get("feedback"),
session=get_session(),
logs=logs_to_string(steps, dbs.logs),
workspace=dbs.workspace["all_output.txt"],
)
return learning
def send_learning(learning: Learning): def send_learning(learning: Learning):
@@ -83,25 +22,18 @@ def send_learning(learning: Learning):
) )
def get_session():
path = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt"
try:
if path.exists():
user_id = path.read_text()
else:
# random uuid:
user_id = str(random.randint(0, 2**32))
path.write_text(user_id)
return user_id
except IOError:
return "ephemeral_" + str(random.randint(0, 2**32))
def collect_learnings(model: str, temperature: float, steps: List[Step], dbs: DBs): def collect_learnings(model: str, temperature: float, steps: List[Step], dbs: DBs):
if os.environ.get("COLLECT_LEARNINGS_OPT_OUT") in ["true", "1"]: if os.environ.get("COLLECT_LEARNINGS_OPT_OUT") in ["true", "1"]:
print("COLLECT_LEARNINGS_OPT_OUT is set to true, not collecting learning") print("COLLECT_LEARNINGS_OPT_OUT is set to true, not collecting learning")
return return
learnings = extract_learning(model, temperature, steps, dbs) learnings = extract_learning(
model, temperature, steps, dbs, steps_file_hash=steps_file_hash()
)
send_learning(learnings) send_learning(learnings)
def steps_file_hash():
    """Return the SHA-256 hex digest of the steps module's source code.

    Used to version telemetry records: two runs with the same hash executed
    the same step definitions.
    """
    with open(steps.__file__, "r") as source_file:
        digest = hashlib.sha256(
            source_file.read().encode("utf-8"), usedforsecurity=False
        )
    return digest.hexdigest()

6
gpt_engineer/domain.py Normal file
View File

@@ -0,0 +1,6 @@
from typing import Callable, List, TypeVar
from gpt_engineer.ai import AI
from gpt_engineer.db import DBs
# A Step is any callable taking (AI, DBs) and returning the list of chat
# messages it produced; steps are identified elsewhere by their __name__.
Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]])

144
gpt_engineer/learning.py Normal file
View File

@@ -0,0 +1,144 @@
import json
import random
import tempfile
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from dataclasses_json import dataclass_json
from termcolor import colored
from gpt_engineer.db import DB, DBs
from gpt_engineer.domain import Step
@dataclass_json
@dataclass
class Review:
    """A human's answers to the post-run review questions.

    Tri-state fields hold True/False for y/n answers and None when the user
    answered "u" (uncertain) or skipped the question.
    """

    ran: Optional[bool]  # did the generated code run at all?
    perfect: Optional[bool]  # did it do everything the user wanted?
    works: Optional[bool]  # did it do anything useful?
    comments: str  # free-text explanation of what was not working
    raw: str  # raw "ran, perfect, useful" answers, comma-separated
@dataclass_json
@dataclass
class Learning:
    """One serializable telemetry record describing a full gpt-engineer run."""

    model: str
    temperature: float
    steps: str  # JSON-encoded list of executed step names
    steps_file_hash: str  # hash identifying the version of the steps source
    prompt: str
    logs: str  # rendered chat logs for every executed step
    workspace: str  # contents of workspace/all_output.txt
    feedback: Optional[str]
    session: str  # pseudonymous per-machine session id
    review: Optional[Review]  # present only when the human_review step ran
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    version: str = "0.3"  # bump when this record's schema changes
# Colored answer choices appended to each review prompt; the trailing text
# makes the rendered prompt read "y/n/u(ncertain): ".
TERM_CHOICES = (
    colored("y", "green")
    + "/"
    + colored("n", "red")
    + "/"
    + colored("u", "yellow")
    + "(ncertain): "
)
def _validated_answer(prompt: str) -> str:
    """Prompt until the user answers y/n/u or just presses enter (skip)."""
    answer = input(prompt)
    while answer not in ("y", "n", "u", ""):
        answer = input("Invalid input. Please enter y, n, or u: ")
    return answer


def human_input() -> Review:
    """Interactively ask the user to review the run.

    Asks up to three y/n/u questions (the follow-ups only when relevant)
    plus an optional free-text comment, and maps the answers onto a Review
    record (True/False for y/n, None for "u" or enter).

    Returns:
        The populated Review.
    """
    print()
    print(
        colored("To help gpt-engineer learn, please answer 3 questions:", "light_green")
    )
    print()
    ran = _validated_answer("Did the generated code run at all? " + TERM_CHOICES)
    perfect = ""
    useful = ""
    if ran == "y":
        perfect = _validated_answer(
            "Did the generated code do everything you wanted? " + TERM_CHOICES
        )
        if perfect != "y":
            useful = _validated_answer(
                "Did the generated code do anything useful? " + TERM_CHOICES
            )
    comments = ""
    if perfect != "y":
        comments = input(
            "If you have time, please explain what was not working "
            + colored("(ok to leave blank)\n", "light_green")
        )
    print(colored("Thank you", "light_green"))
    # Tri-state mapping: y -> True, n -> False, u or enter -> None.
    to_tristate = {"y": True, "n": False, "u": None, "": None}
    return Review(
        raw=", ".join([ran, perfect, useful]),
        ran=to_tristate[ran],
        works=to_tristate[useful],
        perfect=to_tristate[perfect],
        comments=comments,
    )
def logs_to_string(steps: List[Step], logs: DB):
    """Render the logged chat messages of each executed step as one string.

    Each step contributes a "--- <name> ---" header followed by its
    formatted messages, joined by newlines.
    """

    def render(step) -> str:
        header = f"--- {step.__name__} ---\n"
        body = format_messages(json.loads(logs[step.__name__]))
        return header + "\n" + body

    return "\n".join(render(step) for step in steps)
def format_messages(messages: List[dict]) -> str:
    """Render chat messages as newline-joined "role:\\n\\ncontent" paragraphs."""
    rendered = []
    for message in messages:
        rendered.append(f"{message['role']}:\n\n{message['content']}")
    return "\n".join(rendered)
def extract_learning(
    model: str, temperature: float, steps: List[Step], dbs: DBs, steps_file_hash: str
) -> Learning:
    """Assemble a Learning telemetry record from the run's databases.

    Args:
        model: Name of the LLM used for the run.
        temperature: Sampling temperature the run used.
        steps: The step functions that were executed, in order.
        dbs: The run's databases (input, logs, memory, workspace).
        steps_file_hash: Hash of the steps source file, for versioning.

    Returns:
        A populated Learning record, including the human Review when the
        human_review step stored one in memory.
    """
    review = None
    # Only present when the human_review step ran.
    if "review" in dbs.memory:
        review = Review.from_json(dbs.memory["review"])  # type: ignore
    learning = Learning(
        prompt=dbs.input["prompt"],
        model=model,
        temperature=temperature,
        steps=json.dumps([step.__name__ for step in steps]),
        steps_file_hash=steps_file_hash,
        feedback=dbs.input.get("feedback"),
        session=get_session(),
        logs=logs_to_string(steps, dbs.logs),
        workspace=dbs.workspace["all_output.txt"],
        review=review,
    )
    return learning
def get_session():
    """Return a stable pseudonymous user id, persisted in the temp directory.

    Falls back to a throwaway "ephemeral_"-prefixed id when the id file
    cannot be read or written.
    """
    id_path = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt"
    try:
        if id_path.exists():
            return id_path.read_text()
        # Not an actual UUID: a random 32-bit integer serialized as text.
        fresh_id = str(random.randint(0, 2**32))
        id_path.write_text(fresh_id)
        return fresh_id
    except IOError:
        return "ephemeral_" + str(random.randint(0, 2**32))

View File

@@ -3,13 +3,14 @@ import re
import subprocess import subprocess
from enum import Enum from enum import Enum
from typing import Callable, List, TypeVar from typing import List
from termcolor import colored from termcolor import colored
from gpt_engineer.ai import AI from gpt_engineer.ai import AI
from gpt_engineer.chat_to_files import to_files from gpt_engineer.chat_to_files import to_files
from gpt_engineer.db import DBs from gpt_engineer.db import DBs
from gpt_engineer.learning import human_input
def setup_sys_prompt(dbs: DBs) -> str: def setup_sys_prompt(dbs: DBs) -> str:
@@ -34,8 +35,7 @@ def get_prompt(dbs: DBs) -> str:
return dbs.input["prompt"] return dbs.input["prompt"]
# All steps below have this signature # All steps below have the signature Step
Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]])
def simple_gen(ai: AI, dbs: DBs) -> List[dict]: def simple_gen(ai: AI, dbs: DBs) -> List[dict]:
@@ -54,6 +54,9 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
while True: while True:
messages = ai.next(messages, user_input) messages = ai.next(messages, user_input)
if messages[-1]["content"].strip() == "Nothing more to clarify.":
break
if messages[-1]["content"].strip().lower().startswith("no"): if messages[-1]["content"].strip().lower().startswith("no"):
print("Nothing more to clarify.") print("Nothing more to clarify.")
break break
@@ -77,7 +80,7 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]:
"Is anything else unclear? If yes, only answer in the form:\n" "Is anything else unclear? If yes, only answer in the form:\n"
"{remaining unclear areas} remaining questions.\n" "{remaining unclear areas} remaining questions.\n"
"{Next question}\n" "{Next question}\n"
'If everything is sufficiently clear, only answer "no".' 'If everything is sufficiently clear, only answer "Nothing more to clarify.".'
) )
print() print()
@@ -183,10 +186,11 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
print("Executing the code...") print("Executing the code...")
print() print()
print( print(
"\033[92m" # green color colored(
+ "Note: If it does not work as expected, consider running the code" "Note: If it does not work as expected, consider running the code"
+ " in another way than above." + " in another way than above.",
+ "\033[0m" "green",
)
) )
print() print()
print("You can press ctrl+c *once* to stop the execution.") print("You can press ctrl+c *once* to stop the execution.")
@@ -196,16 +200,16 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
try: try:
p.wait() p.wait()
except KeyboardInterrupt: except KeyboardInterrupt:
print("Stopping execution...")
print() print()
print("Stopping execution.")
print("Execution stopped.")
p.kill() p.kill()
print() print()
print("Execution stopped.")
return [] return []
def gen_entrypoint(ai, dbs): def gen_entrypoint(ai: AI, dbs: DBs) -> List[dict]:
messages = ai.start( messages = ai.start(
system=( system=(
"You will get information about a codebase that is currently on disk in " "You will get information about a codebase that is currently on disk in "
@@ -254,6 +258,12 @@ def fix_code(ai: AI, dbs: DBs):
return messages return messages
def human_review(ai: AI, dbs: DBs) -> List[dict]:
    """Ask the user to review the run and persist their answers.

    Stores the Review as JSON under memory["review"] for later telemetry
    collection. Produces no chat messages, so it returns an empty list.
    """
    review = human_input()
    dbs.memory["review"] = review.to_json()  # type: ignore
    return []
class Config(str, Enum): class Config(str, Enum):
DEFAULT = "default" DEFAULT = "default"
BENCHMARK = "benchmark" BENCHMARK = "benchmark"
@@ -273,6 +283,7 @@ STEPS = {
gen_clarified_code, gen_clarified_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.BENCHMARK: [simple_gen, gen_entrypoint], Config.BENCHMARK: [simple_gen, gen_entrypoint],
Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint], Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
@@ -282,6 +293,7 @@ STEPS = {
gen_code, gen_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.TDD_PLUS: [ Config.TDD_PLUS: [
gen_spec, gen_spec,
@@ -290,12 +302,14 @@ STEPS = {
fix_code, fix_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.CLARIFY: [ Config.CLARIFY: [
clarify, clarify,
gen_clarified_code, gen_clarified_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.RESPEC: [ Config.RESPEC: [
gen_spec, gen_spec,
@@ -305,11 +319,12 @@ STEPS = {
fix_code, fix_code,
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
human_review,
], ],
Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint], Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint, human_review],
Config.EXECUTE_ONLY: [execute_entrypoint], Config.EXECUTE_ONLY: [execute_entrypoint],
} }
# Future steps that can be added: # Future steps that can be added:
# run_tests_and_fix_files # run_tests_and_fix_files
# execute_entrypoint_and_fix_files_if_needed # execute_entrypoint_and_fix_files_if_it_results_in_error

View File

@@ -2,11 +2,13 @@ import json
import os import os
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pytest import pytest
import rudderstack.analytics as rudder_analytics import rudderstack.analytics as rudder_analytics
from gpt_engineer.collect import collect_learnings, extract_learning from gpt_engineer.collect import collect_learnings, steps_file_hash
from gpt_engineer.db import DB, DBs from gpt_engineer.db import DB, DBs
from gpt_engineer.learning import extract_learning
from gpt_engineer.steps import gen_code from gpt_engineer.steps import gen_code
@@ -28,10 +30,18 @@ def test_collect_learnings(monkeypatch):
collect_learnings(model, temperature, steps, dbs) collect_learnings(model, temperature, steps, dbs)
learnings = extract_learning(model, temperature, steps, dbs) learnings = extract_learning(
model, temperature, steps, dbs, steps_file_hash=steps_file_hash()
)
assert rudder_analytics.track.call_count == 1 assert rudder_analytics.track.call_count == 1
assert rudder_analytics.track.call_args[1]["event"] == "learning" assert rudder_analytics.track.call_args[1]["event"] == "learning"
assert rudder_analytics.track.call_args[1]["properties"] == learnings.to_dict() a = {
k: v
for k, v in rudder_analytics.track.call_args[1]["properties"].items()
if k != "timestamp"
}
b = {k: v for k, v in learnings.to_dict().items() if k != "timestamp"}
assert a == b
assert code in learnings.logs assert code in learnings.logs
assert code in learnings.workspace assert code in learnings.workspace