mirror of
https://github.com/aljazceru/gpt-engineer.git
synced 2025-12-18 05:05:16 +01:00
91 lines
3.0 KiB
Python
91 lines
3.0 KiB
Python
import re
|
|
from typing import List, Tuple
|
|
from gpt_engineer.db import DB
|
|
|
|
# Number of leading lines of a code block that are also searched for a
# filename (see the "target area" description inside parse_chat)
N_CODELINES_FOR_FILENAME_TA = 5

# Path assigned to a code block when no filename can be found for it
DEFAULT_PATH = 'unknown.txt'


def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
    '''
    Parses a chat message and returns a list of tuples containing
    the file path and the code content for each file.

    Every ``` fenced block found in `chat` is treated as one file, in order
    of appearance. Its path is the first filename-looking token
    (`name.ext`, extension of 1 to 6 word characters) found in the text that
    precedes the block or in the first N_CODELINES_FOR_FILENAME_TA lines of
    the block itself. DEFAULT_PATH is used when there is no such token.

    A fenced block whose whole content is just a filename (surrounding
    whitespace and newlines ignored) is not a file. It is skipped and stays
    part of the text searched for the filename of the next block.

    Args:
        chat: The full chat text to scan.
        verbose: When True, print the chosen path and the text that was
            searched for it, for every fenced block.

    Returns:
        A list of `(path, content)` tuples. `content` is everything after the
        first line of the block; that first line is dropped because it
        normally only carries the language tag that follows the opening ```.
    '''
    code_regex = r'```(.*?)```'
    filename_regex = r'\b[\w-]+\.[\w]{1,6}\b'

    prev_code_y_end = 0
    files = []

    # Walk over all ``` (code) blocks; DOTALL lets a block span several lines
    for match in re.finditer(code_regex, chat, re.DOTALL):
        block = match.group(1)
        lines = block.split('\n')

        # Now, we need to get the filename associated with this code block.
        # We look for it somewhere near the code block start; this
        # "somewhere near" is the "filename_ta", a sort-of target area (ta).
        #
        # The target area is made of:
        #   - the text preceding the code block that does not belong to an
        #     already accepted code block ("no_code"), and
        #   - the first few lines of the code block itself, because the
        #     filename is sometimes only given there, e.g.:
        #
        #       ```python
        #       # File: entrypoint.py
        #       import pygame
        #       ```
        #
        # How many lines of the block are included is set by the constant
        # 'N_CODELINES_FOR_FILENAME_TA'.
        no_code = chat[prev_code_y_end:match.start()].strip()
        within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])
        filename_ta = no_code + '\n' + within_code

        # The path is the first filename-like token inside the target area
        filename = re.search(filename_regex, filename_ta)
        path = filename.group(0) if filename else DEFAULT_PATH

        # Visualize the filename_ta if verbose
        if verbose:
            print('-' * 20)
            print(f'Path: {path}')
            print('-' * 20)
            print(filename_ta)
            print('-' * 20)

        # Skip false positives: a fence that only wraps a filename, e.g.
        #
        #   ```main.py```
        #   ```python
        #   ...
        #   ```
        #
        # The content is stripped first so that a filename sitting on its
        # own line between the fences is recognised as well. Such a fence
        # does not move 'prev_code_y_end', which keeps its text inside the
        # "no_code" area of the next block, where it names that block.
        if re.fullmatch(filename_regex, block.strip()):
            continue

        # Remember where this accepted block ends
        prev_code_y_end = match.end()

        # File and code have been matched, add them to the list
        files.append((path, '\n'.join(lines[1:])))

    return files
|
|
|
|
|
|
def to_files(chat: str, workspace: DB):
    '''
    Stores the raw chat and every file parsed from it in the workspace.

    The unmodified chat is assigned to the "all_output.txt" key first. The
    chat is then parsed with parse_chat and each resulting (path, content)
    pair is assigned to the workspace under its path, in the order in which
    the pairs were parsed.
    '''
    # Keep the raw chat before any parsing takes place
    workspace["all_output.txt"] = chat

    for path, content in parse_chat(chat):
        workspace[path] = content
|