Improved chat parsing with no AI logic (#120)

2026-02-04 12:45:56 +01:00 · 2023-06-18 13:10:08 +01:00
parent 9cc9cf7329
commit 8facedd8d1
2 changed files with 201 additions and 11 deletions
--- a/gpt_engineer/chat_to_files.py
+++ b/gpt_engineer/chat_to_files.py
@@ -2,21 +2,82 @@ import re
 from typing import List, Tuple
 from gpt_engineer.db import DB

+# Number of leading lines within the code block to consider for filename discovery
+N_CODELINES_FOR_FILENAME_TA = 5

-def parse_chat(chat) -> List[Tuple[str, str]]:
-    # Get all ``` blocks
-    regex = r"```(.*?)```"
+# Default path to use if no filename is found
+DEFAULT_PATH = 'unknown.txt'

-    matches = re.finditer(regex, chat, re.DOTALL)

+def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
+    '''
+    Parses a chat message and returns a list of tuples containing
+    the file path and the code content for each file.
+    '''
+    code_regex = r'```(.*?)```'
+    filename_regex = r'\b[\w-]+\.[\w]{1,6}\b'
+
+    # Get all ``` (code) blocks
+    code_matches = re.finditer(code_regex, chat, re.DOTALL)
+    
+    prev_code_y_end = 0
    files = []
-    for match in matches:
-        path = match.group(1).split("\n")[0]
-        # Get the code
-        code = match.group(1).split("\n")[1:]
-        code = "\n".join(code)
-        # Add the file to the list
-        files.append((path, code))
+    for match in code_matches:
+        lines = match.group(1).split('\n')
+        code_y_start = match.start()
+        code_y_end = match.end()
+
+        # Now, we need to get the filename associated with this code block.
+        # We will look for the filename somewhere near the code block start.
+        #
+        # This nearby region is referred to as the "filename_ta", i.e. the
+        # target area (ta) that is searched for a filename.
+        #
+        # The target area includes the text preceding the code block that
+        # does not belong to previous code blocks ("no_code").
+        # Additionally, as sometimes the filename is defined within
+        # the code block itself, we will also include the first few lines
+        # of the code block in the filename_ta.
+        #
+        # Example:
+        # ```python
+        # # File: entrypoint.py
+        # import pygame
+        # ```
+        #
+        # The number of lines to consider within the code block is set by
+        # the constant 'N_CODELINES_FOR_FILENAME_TA'.
+        #
+        # Get the "preceding" text, which is located between codeblocks
+        no_code = chat[prev_code_y_end:code_y_start].strip()
+        within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])
+        filename_ta = no_code + '\n' + within_code
+        
+        # The path is the first filename-like token found in the target area
+        filename = re.search(filename_regex, filename_ta)
+        path = filename.group(0) if filename else DEFAULT_PATH
+
+        # Visualize the filename_ta if verbose
+        if verbose:
+            print('-' * 20)
+            print(f'Path: {path}')
+            print('-' * 20)
+            print(filename_ta)
+            print('-' * 20)
+        
+        # Check that the block is not a false positive
+        #
+        # For instance, a filename-only block like ```main.py``` must be skipped.
+        # ```main.py```
+        # ```python
+        # ...
+        # ```
+        if not re.fullmatch(filename_regex, '\n'.join(lines)):
+            # Update the previous code block end
+            prev_code_y_end = code_y_end
+
+            # File and code have been matched, add them to the list
+            files.append((path, '\n'.join(lines[1:])))

    return files

--- a/tests/test_chat_parser.py
+++ b/tests/test_chat_parser.py
@@ -0,0 +1,129 @@
+import unittest
+from gpt_engineer.chat_to_files import parse_chat
+
+CODE_FORMATS = '''
+(1)
+File: main.py
+
+```python
+import pygame
+````
+
+(2)
+entry.py
+```python
+import pygame
+```
+
+(3)
+```python
+# File: rickroll.py
+import pygame
+```
+
+(4)
+```python
+
+# File: engineer.py
+import pygame
+```
+
+(5)
+```adastra.py
+import pygame
+```
+
+(6)
+```python bird.py
+import pygame
+```
+
+(7)
+```obstacle.py python
+import pygame
+```
+
+(8)
+```major1.py````
+```python
+import pygame
+```
+
+(9)
+```major2.py````
+```python
+import pygame
+```
+
+(10)
+```js
+// File: bruh.js
+const a = 1;
+```
+
+(11)
+```swag.tsx
+// File: swag.tsx
+const a: number = 1;
+```
+
+(12)
+```gmoita.ts
+// File: gmoita.tsx
+const a: number = 1;
+```
+
+(13)
+** file1.py **
+```python
+import pygame
+```
+
+(14)
+**file2.py**
+```python
+import pygame
+```
+
+(15)
+#### `gm.py`
+```python
+import pygame
+```
+'''
+
+class TestChatParsing(unittest.TestCase):
+    
+    def setUp(self):
+        self._expected_filenames = (
+            'main.py',
+            'entry.py',
+            'rickroll.py',
+            'engineer.py',
+            'adastra.py',
+            'bird.py',
+            'obstacle.py',
+            'major1.py',
+            'major2.py',
+            'bruh.js',
+            'swag.tsx',
+            'gmoita.ts',
+            'file1.py',
+            'file2.py',
+            'gm.py',
+        )
+        self.chat = CODE_FORMATS
+
+    def test_parsing(self):
+        codefiles = parse_chat(self.chat)
+
+        self.assertEqual(len(codefiles), len(self._expected_filenames))
+        for i, cf in enumerate(codefiles):
+            filename, content = cf
+            
+            self.assertEqual(filename, self._expected_filenames[i])
+            self.assertNotEqual(content, '')
+
+if __name__ == '__main__':
+    unittest.main()
+