From ba33e681df3efabf779ac180483910ec03e157b0 Mon Sep 17 00:00:00 2001 From: Anton Osika Date: Sun, 25 Jun 2023 14:00:06 +0200 Subject: [PATCH] Fix failing tests. Add review step. --- gpt_engineer/collect.py | 92 ++++--------------------- gpt_engineer/domain.py | 6 ++ gpt_engineer/learning.py | 144 +++++++++++++++++++++++++++++++++++++++ gpt_engineer/steps.py | 41 +++++++---- tests/test_collect.py | 16 ++++- 5 files changed, 203 insertions(+), 96 deletions(-) create mode 100644 gpt_engineer/domain.py create mode 100644 gpt_engineer/learning.py diff --git a/gpt_engineer/collect.py b/gpt_engineer/collect.py index 090ed0d..118e556 100644 --- a/gpt_engineer/collect.py +++ b/gpt_engineer/collect.py @@ -1,73 +1,12 @@ import hashlib -import json import os -import random -import tempfile -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path from typing import List -from dataclasses_json import dataclass_json - from gpt_engineer import steps -from gpt_engineer.db import DB, DBs -from gpt_engineer.steps import Step - - -@dataclass_json -@dataclass -class Learning: - model: str - temperature: float - steps: str - steps_file_hash: str - prompt: str - logs: str - workspace: str - feedback: str | None - session: str - timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) - version: str = "0.2" - - -def steps_file_hash(): - with open(steps.__file__, "r") as f: - content = f.read() - return hashlib.sha256(content.encode("utf-8"), usedforsecurity=False).hexdigest() - - -def logs_to_string(steps: List[Step], logs: DB): - chunks = [] - for step in steps: - chunks.append(f"--- {step.__name__} ---\n") - messages = json.loads(logs[step.__name__]) - chunks.append(format_messages(messages)) - return "\n".join(chunks) - - -def format_messages(messages: List[dict]) -> str: - return "\n".join( - [f"{message['role']}:\n\n{message['content']}" for message in messages] - ) - - -def extract_learning( - model: str, temperature: float, steps: List[Step], dbs: DBs -) -> Learning: - learning = Learning( - prompt=dbs.input["prompt"], - model=model, - temperature=temperature, - steps=json.dumps([step.__name__ for step in steps]), - steps_file_hash=steps_file_hash(), - feedback=dbs.input.get("feedback"), - session=get_session(), - logs=logs_to_string(steps, dbs.logs), - workspace=dbs.workspace["all_output.txt"], - ) - return learning +from gpt_engineer.db import DBs +from gpt_engineer.domain import Step +from gpt_engineer.learning import Learning, extract_learning def send_learning(learning: Learning): @@ -83,25 +22,18 @@ def send_learning(learning: Learning): ) -def get_session(): - path = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt" - - try: - if path.exists(): - user_id = path.read_text() - else: - # random uuid: - user_id = str(random.randint(0, 2**32)) - path.write_text(user_id) - return user_id - except IOError: - return "ephemeral_" + str(random.randint(0, 2**32)) - - def collect_learnings(model: str, temperature: float, steps: List[Step], dbs: DBs): if os.environ.get("COLLECT_LEARNINGS_OPT_OUT") in ["true", "1"]: print("COLLECT_LEARNINGS_OPT_OUT is set to true, not collecting learning") return - learnings = extract_learning(model, temperature, steps, dbs) + learnings = extract_learning( + model, temperature, steps, dbs, steps_file_hash=steps_file_hash() + ) send_learning(learnings) + + +def steps_file_hash(): + with open(steps.__file__, "r") as f: + content = f.read() + return hashlib.sha256(content.encode("utf-8"), usedforsecurity=False).hexdigest() diff --git a/gpt_engineer/domain.py b/gpt_engineer/domain.py new file mode 100644 index 0000000..ecaf861 --- /dev/null +++ b/gpt_engineer/domain.py @@ -0,0 +1,6 @@ +from typing import Callable, List, TypeVar + +from gpt_engineer.ai import AI +from gpt_engineer.db import DBs + +Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]]) diff --git a/gpt_engineer/learning.py b/gpt_engineer/learning.py new file mode 100644 index 0000000..3d24e46 --- /dev/null +++ b/gpt_engineer/learning.py @@ -0,0 +1,144 @@ +import json +import random +import tempfile + +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import List, Optional + +from dataclasses_json import dataclass_json +from termcolor import colored + +from gpt_engineer.db import DB, DBs +from gpt_engineer.domain import Step + + +@dataclass_json +@dataclass +class Review: + ran: Optional[bool] + perfect: Optional[bool] + works: Optional[bool] + comments: str + raw: str + + +@dataclass_json +@dataclass +class Learning: + model: str + temperature: float + steps: str + steps_file_hash: str + prompt: str + logs: str + workspace: str + feedback: Optional[str] + session: str + review: Optional[Review] + timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) + version: str = "0.3" + + +TERM_CHOICES = ( + colored("y", "green") + + "/" + + colored("n", "red") + + "/" + + colored("u", "yellow") + + "(ncertain): " +) + + +def human_input() -> Review: + print() + print( + colored("To help gpt-engineer learn, please answer 3 questions:", "light_green") + ) + print() + + ran = input("Did the generated code run at all? " + TERM_CHOICES) + while ran not in ("y", "n", "u", ""): + ran = input("Invalid input. Please enter y, n, or u: ") + + perfect = "" + useful = "" + + if ran == "y": + perfect = input( + "Did the generated code do everything you wanted? " + TERM_CHOICES + ) + while perfect not in ("y", "n", "u", ""): + perfect = input("Invalid input. Please enter y, n, or u: ") + + if perfect != "y": + useful = input("Did the generated code do anything useful? " + TERM_CHOICES) + while useful not in ("y", "n", "u", ""): + useful = input("Invalid input. Please enter y, n, or u: ") + + comments = "" + if perfect != "y": + comments = input( + "If you have time, please explain what was not working " + + colored("(ok to leave blank)\n", "light_green") + ) + print(colored("Thank you", "light_green")) + return Review( + raw=", ".join([ran, perfect, useful]), + ran={"y": True, "n": False, "u": None, "": None}[ran], + works={"y": True, "n": False, "u": None, "": None}[useful], + perfect={"y": True, "n": False, "u": None, "": None}[perfect], + comments=comments, + ) + + +def logs_to_string(steps: List[Step], logs: DB): + chunks = [] + for step in steps: + chunks.append(f"--- {step.__name__} ---\n") + messages = json.loads(logs[step.__name__]) + chunks.append(format_messages(messages)) + return "\n".join(chunks) + + +def format_messages(messages: List[dict]) -> str: + return "\n".join( + [f"{message['role']}:\n\n{message['content']}" for message in messages] + ) + + +def extract_learning( + model: str, temperature: float, steps: List[Step], dbs: DBs, steps_file_hash +) -> Learning: + review = None + if "review" in dbs.memory: + review = Review.from_json(dbs.memory["review"]) # type: ignore + learning = Learning( + prompt=dbs.input["prompt"], + model=model, + temperature=temperature, + steps=json.dumps([step.__name__ for step in steps]), + steps_file_hash=steps_file_hash, + feedback=dbs.input.get("feedback"), + session=get_session(), + logs=logs_to_string(steps, dbs.logs), + workspace=dbs.workspace["all_output.txt"], + review=review, + ) + return learning + + +def get_session(): + path = Path(tempfile.gettempdir()) / "gpt_engineer_user_id.txt" + + try: + if path.exists(): + user_id = path.read_text() + else: + # random uuid: + user_id = str(random.randint(0, 2**32)) + path.write_text(user_id) + return user_id + except IOError: + return "ephemeral_" + str(random.randint(0, 2**32)) diff --git a/gpt_engineer/steps.py b/gpt_engineer/steps.py index 41530ea..9e26b71 100644 --- a/gpt_engineer/steps.py +++ b/gpt_engineer/steps.py @@ -3,13 +3,14 @@ import re import subprocess from enum import Enum -from typing import Callable, List, TypeVar +from typing import List from termcolor import colored from gpt_engineer.ai import AI from gpt_engineer.chat_to_files import to_files from gpt_engineer.db import DBs +from gpt_engineer.learning import human_input def setup_sys_prompt(dbs: DBs) -> str: @@ -34,8 +35,7 @@ def get_prompt(dbs: DBs) -> str: return dbs.input["prompt"] -# All steps below have this signature -Step = TypeVar("Step", bound=Callable[[AI, DBs], List[dict]]) +# All steps below have the signature Step def simple_gen(ai: AI, dbs: DBs) -> List[dict]: @@ -54,6 +54,9 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]: while True: messages = ai.next(messages, user_input) + if messages[-1]["content"].strip() == "Nothing more to clarify.": + break + if messages[-1]["content"].strip().lower().startswith("no"): print("Nothing more to clarify.") break @@ -77,7 +80,7 @@ def clarify(ai: AI, dbs: DBs) -> List[dict]: "Is anything else unclear? If yes, only answer in the form:\n" "{remaining unclear areas} remaining questions.\n" "{Next question}\n" - 'If everything is sufficiently clear, only answer "no".' + 'If everything is sufficiently clear, only answer "Nothing more to clarify.".' ) print() @@ -183,10 +186,11 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]: print("Executing the code...") print() print( - "\033[92m" # green color - + "Note: If it does not work as expected, consider running the code" - + " in another way than above." - + "\033[0m" + colored( + "Note: If it does not work as expected, consider running the code" + + " in another way than above.", + "green", + ) ) print() print("You can press ctrl+c *once* to stop the execution.") @@ -196,16 +200,16 @@ def execute_entrypoint(ai: AI, dbs: DBs) -> List[dict]: try: p.wait() except KeyboardInterrupt: - print("Stopping execution...") print() + print("Stopping execution.") + print("Execution stopped.") p.kill() print() - print("Execution stopped.") return [] -def gen_entrypoint(ai, dbs): +def gen_entrypoint(ai: AI, dbs: DBs) -> List[dict]: messages = ai.start( system=( "You will get information about a codebase that is currently on disk in " @@ -254,6 +258,12 @@ def fix_code(ai: AI, dbs: DBs): return messages +def human_review(ai: AI, dbs: DBs): + review = human_input() + dbs.memory["review"] = review.to_json() # type: ignore + return [] + + class Config(str, Enum): DEFAULT = "default" BENCHMARK = "benchmark" @@ -273,6 +283,7 @@ STEPS = { gen_clarified_code, gen_entrypoint, execute_entrypoint, + human_review, ], Config.BENCHMARK: [simple_gen, gen_entrypoint], Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint], @@ -282,6 +293,7 @@ STEPS = { gen_code, gen_entrypoint, execute_entrypoint, + human_review, ], Config.TDD_PLUS: [ gen_spec, @@ -290,12 +302,14 @@ STEPS = { fix_code, gen_entrypoint, execute_entrypoint, + human_review, ], Config.CLARIFY: [ clarify, gen_clarified_code, gen_entrypoint, execute_entrypoint, + human_review, ], Config.RESPEC: [ gen_spec, @@ -305,11 +319,12 @@ STEPS = { fix_code, gen_entrypoint, execute_entrypoint, + human_review, ], - Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint], + Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint, human_review], Config.EXECUTE_ONLY: [execute_entrypoint], } # Future steps that can be added: # run_tests_and_fix_files -# execute_entrypoint_and_fix_files_if_needed +# execute_entrypoint_and_fix_files_if_it_results_in_error diff --git a/tests/test_collect.py b/tests/test_collect.py index 76fabde..761e002 100644 --- a/tests/test_collect.py +++ b/tests/test_collect.py @@ -2,11 +2,13 @@ import json import os from unittest.mock import MagicMock + import pytest import rudderstack.analytics as rudder_analytics -from gpt_engineer.collect import collect_learnings, extract_learning +from gpt_engineer.collect import collect_learnings, steps_file_hash from gpt_engineer.db import DB, DBs +from gpt_engineer.learning import extract_learning from gpt_engineer.steps import gen_code @@ -28,10 +30,18 @@ def test_collect_learnings(monkeypatch): collect_learnings(model, temperature, steps, dbs) - learnings = extract_learning(model, temperature, steps, dbs) + learnings = extract_learning( + model, temperature, steps, dbs, steps_file_hash=steps_file_hash() + ) assert rudder_analytics.track.call_count == 1 assert rudder_analytics.track.call_args[1]["event"] == "learning" - assert rudder_analytics.track.call_args[1]["properties"] == learnings.to_dict() + a = { + k: v + for k, v in rudder_analytics.track.call_args[1]["properties"].items() + if k != "timestamp" + } + b = {k: v for k, v in learnings.to_dict().items() if k != "timestamp"} + assert a == b assert code in learnings.logs assert code in learnings.workspace