# gpt-engineer/gpt_engineer/chat_to_files.py

import re
from typing import List, Tuple
from gpt_engineer.db import DB

# Number of leading lines inside a code block that are searched for a
# filename, in addition to the text that precedes the block
N_CODELINES_FOR_FILENAME_TA = 5

# Path assigned to a code block when no filename is found for it
DEFAULT_PATH = 'unknown.txt'


def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
    """
    Parse a chat message into a list of ``(path, content)`` tuples.

    Every span enclosed in triple backticks is treated as a code block.
    For each block a filename is looked up in a "target area"
    (``filename_ta``) and the block body, minus its first line (the line
    carrying the language tag), becomes the file content.

    Args:
        chat: The full chat text to scan for code blocks.
        verbose: When true, print the chosen path and the target area
            that was searched for every code block.

    Returns:
        One ``(path, content)`` tuple per accepted code block, in the
        order the blocks appear in ``chat``. Blocks for which no filename
        is found get ``DEFAULT_PATH`` as their path. Blocks that consist
        of nothing but a filename are treated as labels and skipped.
    """
    code_regex = re.compile(r'```(.*?)```', re.DOTALL)
    # A filename is one or more dot-separated segments followed by an
    # extension of 1-6 word characters. Allowing several segments keeps
    # names such as "index.test.js" or "app.component.ts" in one piece
    # instead of truncating them at the first dot.
    filename_regex = re.compile(r'\b[\w-]+(?:\.[\w-]+)*\.\w{1,6}\b')

    prev_code_y_end = 0
    files = []
    for match in code_regex.finditer(chat):
        block = match.group(1)
        lines = block.split('\n')

        # The filename is searched in a target area ("filename_ta") made of:
        #   1. the text preceding the code block that does not belong to an
        #      already accepted code block ("no_code"), and
        #   2. the first N_CODELINES_FOR_FILENAME_TA lines of the block
        #      itself, since the filename is sometimes given inside it:
        #
        #      ```python
        #      # File: entrypoint.py
        #      import pygame
        #      ```
        no_code = chat[prev_code_y_end:match.start()].strip()
        within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])
        filename_ta = no_code + '\n' + within_code

        # The first filename-looking token in the target area wins
        filename = filename_regex.search(filename_ta)
        path = filename.group(0) if filename else DEFAULT_PATH

        # Visualize the filename_ta if verbose
        if verbose:
            print('-' * 20)
            print(f'Path: {path}')
            print('-' * 20)
            print(filename_ta)
            print('-' * 20)

        # A block that is nothing but a filename is a label, not code:
        #
        #   ```main.py```
        #   ```python
        #   ...
        #   ```
        #
        # Skip it WITHOUT advancing prev_code_y_end, so the label stays
        # part of the preceding text of the next (real) code block.
        if filename_regex.fullmatch(block):
            continue

        prev_code_y_end = match.end()

        # Drop the first line (language tag) and keep the rest as content
        files.append((path, '\n'.join(lines[1:])))

    return files


def to_files(chat: str, workspace: DB):
    """
    Store a chat message and the files parsed from it in ``workspace``.

    The unmodified chat is assigned to the ``"all_output.txt"`` key, then
    every ``(path, content)`` pair returned by ``parse_chat`` is assigned
    to ``workspace[path]``. Later pairs with the same path overwrite
    earlier ones.
    """
    # Keep the raw chat alongside the extracted files
    workspace["all_output.txt"] = chat

    for path, content in parse_chat(chat):
        workspace[path] = content