Revert "Improved chat parsing with no AI logic (#120)"

This reverts commit 8facedd8d1.
2025-12-18 21:25:11 +01:00 · 2023-06-18 14:25:49 +02:00
parent 8facedd8d1
commit c999f7c2c8
2 changed files with 11 additions and 201 deletions
--- a/gpt_engineer/chat_to_files.py
+++ b/gpt_engineer/chat_to_files.py
@@ -2,82 +2,21 @@ import re
 from typing import List, Tuple
 from gpt_engineer.db import DB
 # Amount of lines within the code block to consider for filename discovery
 N_CODELINES_FOR_FILENAME_TA = 5
-# Default path to use if no filename is found
+def parse_chat(chat) -> List[Tuple[str, str]]:
-DEFAULT_PATH = 'unknown.txt'
+    # Get all ``` blocks
    regex = r"```(.*?)```"
    matches = re.finditer(regex, chat, re.DOTALL)
 def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
    '''
    Parses a chat message and returns a list of tuples containing
    the file path and the code content for each file.
    '''
    code_regex = r'```(.*?)```'
    filename_regex = r'\b[\w-]+\.[\w]{1,6}\b'
    # Get all ``` (code) blocks
    code_matches = re.finditer(code_regex, chat, re.DOTALL)
    prev_code_y_end = 0
    files = []
-    for match in code_matches:
+    for match in matches:
-        lines = match.group(1).split('\n')
+        path = match.group(1).split("\n")[0]
-        code_y_start = match.start()
+        # Get the code
-        code_y_end = match.end()
+        code = match.group(1).split("\n")[1:]
-
+        code = "\n".join(code)
-        # Now, we need to get the filename associated with this code block.
+        # Add the file to the list
-        # We will look for the filename somewhere near the code block start.
+        files.append((path, code))
        #
        # This "somewhere near" is referred to as the "filename_ta", to
        # resemble a sort-of target area (ta).
        #
        # The target area includes the text preceding the code block that
        # does not belong to previous code blocks ("no_code").
        # Additionally, as sometimes the filename is defined within
        # the code block itself, we will also include the first few lines
        # of the code block in the filename_ta.
        #
        # Example:
        # ```python
        # # File: entrypoint.py
        # import pygame
        # ```
        #
        # The amount of lines to consider within the code block is set by
        # the constant 'N_CODELINES_FOR_FILENAME_TA'.
        #
        # Get the "preceding" text, which is located between codeblocks
        no_code = chat[prev_code_y_end:code_y_start].strip()
        within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])
        filename_ta = no_code + '\n' + within_code
        # The path is the filename itself which we greedily match
        filename = re.search(filename_regex, filename_ta)
        path = filename.group(0) if filename else DEFAULT_PATH
        # Visualize the filename_ta if verbose
        if verbose:
            print('-' * 20)
            print(f'Path: {path}')
            print('-' * 20)
            print(filename_ta)
            print('-' * 20)
        # Check if it's not a false positive
        #
        # For instance, the match with ```main.py``` should not be considered.
        # ```main.py```
        # ```python
        # ...
        # ```
        if not re.fullmatch(filename_regex, '\n'.join(lines)):
            # Update the previous code block end
            prev_code_y_end = code_y_end
            # File and code have been matched, add them to the list
            files.append((path, '\n'.join(lines[1:])))
    return files
--- a/tests/test_chat_parser.py
+++ b/tests/test_chat_parser.py
@@ -1,129 +0,0 @@
 import unittest
 from gpt_engineer.chat_to_files import parse_chat
 # Chat fixture: fifteen numbered entries, each pairing a fenced code block
 # with a filename written in a different place (text before the fence, a
 # "File:" comment on the first code line, the fence's info string, or
 # bold/heading markup). Some fences here use four backticks; the text is
 # kept exactly as is because the test parses it verbatim.
 CODE_FORMATS = '''
 (1)
 File: main.py
 ```python
 import pygame
 ````
 (2)
 entry.py
 ```python
 import pygame
 ```
 (3)
 ```python
 # File: rickroll.py
 import pygame
 ```
 (4)
 ```python
 # File: engineer.py
 import pygame
 ```
 (5)
 ```adastra.py
 import pygame
 ```
 (6)
 ```python bird.py
 import pygame
 ```
 (7)
 ```obstacle.py python
 import pygame
 ```
 (8)
 ```major1.py````
 ```python
 import pygame
 ```
 (9)
 ```major2.py````
 ```python
 import pygame
 ```
 (10)
 ```js
 // File: bruh.js
 const a = 1;
 ```
 (11)
 ```swag.tsx
 // File: swag.tsx
 const a: number = 1;
 ```
 (12)
 ```gmoita.ts
 // File: gmoita.tsx
 const a: number = 1;
 ```
 (13)
 ** file1.py **
 ```python
 import pygame
 ```
 (14)
 **file2.py**
 ```python
 import pygame
 ```
 (15)
 #### `gm.py`
 ```python
 import pygame
 ```
 '''
 class TestChatParsing(unittest.TestCase):
    """Checks the filenames parse_chat extracts from the CODE_FORMATS chat."""
    def setUp(self):
        # Expected filenames, listed in the same order as the numbered
        # entries (1)-(15) of CODE_FORMATS; the test compares by position.
        self._expected_filenames = (
            'main.py',
            'entry.py',
            'rickroll.py',
            'engineer.py',
            'adastra.py',
            'bird.py',
            'obstacle.py',
            'major1.py',
            'major2.py',
            'bruh.js',
            'swag.tsx',
            'gmoita.ts',
            'file1.py',
            'file2.py',
            'gm.py',
        )
        # Chat text handed to parse_chat in the test below.
        self.chat = CODE_FORMATS
    def test_parsing(self):
        # parse_chat is expected to return one (filename, content) pair per
        # entry of the fixture.
        codefiles = parse_chat(self.chat)
        # Check the count first so a missing or extra match is reported
        # before the per-item comparison.
        self.assertEqual(len(codefiles), len(self._expected_filenames))
        for i, cf in enumerate(codefiles):
            filename, content = cf
            # The i-th parsed file must carry the i-th expected filename...
            self.assertEqual(filename, self._expected_filenames[i])
            # ...and must not have empty content.
            self.assertNotEqual(content, '')
 # Allow running this test module directly as a script.
 if __name__ == '__main__':
    unittest.main()