mirror of
https://github.com/aljazceru/gpt-engineer.git
synced 2025-12-17 20:55:09 +01:00
Improved chat parsing with no AI logic (#120)
This commit is contained in:
@@ -2,21 +2,82 @@ import re
|
||||
from typing import List, Tuple
|
||||
from gpt_engineer.db import DB
|
||||
|
||||
# Number of lines within the code block to consider for filename discovery
N_CODELINES_FOR_FILENAME_TA = 5

# Default path to use if no filename is found
DEFAULT_PATH = 'unknown.txt'

# Patterns are compiled once at import time instead of on every call.
# A code block is anything between two ``` fences (DOTALL: may span lines).
_CODE_BLOCK_RE = re.compile(r'```(.*?)```', re.DOTALL)
# A filename is a word (letters, digits, '_' or '-') followed by a dot and
# an extension of 1 to 6 word characters.
_FILENAME_RE = re.compile(r'\b[\w-]+\.[\w]{1,6}\b')


def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
    """
    Parse a chat message into a list of ``(path, code)`` tuples.

    Every fenced (```) block in ``chat`` is treated as the content of one
    file. The file path is discovered heuristically, without any AI logic,
    by searching a "target area" around the start of the block.

    Args:
        chat: The full chat text to parse.
        verbose: When true, print the detected path and the text that was
            searched for each code block.

    Returns:
        One ``(path, code)`` tuple per accepted code block, in order of
        appearance. ``path`` is ``DEFAULT_PATH`` when no filename is found;
        ``code`` is the block content without its first line (the fence
        info line, e.g. ``python``).
    """
    prev_code_y_end = 0
    files = []

    for match in _CODE_BLOCK_RE.finditer(chat):
        block_text = match.group(1)
        lines = block_text.split('\n')
        code_y_start = match.start()
        code_y_end = match.end()

        # Now, we need to get the filename associated with this code block.
        # We will look for the filename somewhere near the code block start.
        #
        # This "somewhere near" is referred to as the "filename_ta", to
        # resemble a sort-of target area (ta).
        #
        # The target area includes the text preceding the code block that
        # does not belong to previous code blocks ("no_code").
        # Additionally, as sometimes the filename is defined within
        # the code block itself, we will also include the first few lines
        # of the code block in the filename_ta.
        #
        # Example:
        # ```python
        # # File: entrypoint.py
        # import pygame
        # ```
        #
        # The number of lines to consider within the code block is set by
        # the constant 'N_CODELINES_FOR_FILENAME_TA'.
        #
        # Get the "preceding" text, which is located between code blocks
        no_code = chat[prev_code_y_end:code_y_start].strip()
        within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])
        filename_ta = no_code + '\n' + within_code

        # The path is the first filename-looking token in the target area
        filename = _FILENAME_RE.search(filename_ta)
        path = filename.group(0) if filename else DEFAULT_PATH

        # Visualize the filename_ta if verbose
        if verbose:
            print('-' * 20)
            print(f'Path: {path}')
            print('-' * 20)
            print(filename_ta)
            print('-' * 20)

        # Check that it's not a false positive
        #
        # For instance, the match with ```main.py``` should not be considered.
        # ```main.py```
        # ```python
        # ...
        # ```
        #
        # Such a block is skipped WITHOUT advancing prev_code_y_end, so its
        # text stays inside the target area of the next code block and can
        # still provide that block's filename.
        if _FILENAME_RE.fullmatch(block_text):
            continue

        # Update the previous code block end
        prev_code_y_end = code_y_end

        # File and code have been matched, add them to the list
        files.append((path, '\n'.join(lines[1:])))

    return files
|
||||
|
||||
|
||||
129
tests/test_chat_parser.py
Normal file
129
tests/test_chat_parser.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import unittest
|
||||
from gpt_engineer.chat_to_files import parse_chat
|
||||
|
||||
CODE_FORMATS = '''
|
||||
(1)
|
||||
File: main.py
|
||||
|
||||
```python
|
||||
import pygame
|
||||
````
|
||||
|
||||
(2)
|
||||
entry.py
|
||||
```python
|
||||
import pygame
|
||||
```
|
||||
|
||||
(3)
|
||||
```python
|
||||
# File: rickroll.py
|
||||
import pygame
|
||||
```
|
||||
|
||||
(4)
|
||||
```python
|
||||
|
||||
# File: engineer.py
|
||||
import pygame
|
||||
```
|
||||
|
||||
(5)
|
||||
```adastra.py
|
||||
import pygame
|
||||
```
|
||||
|
||||
(6)
|
||||
```python bird.py
|
||||
import pygame
|
||||
```
|
||||
|
||||
(7)
|
||||
```obstacle.py python
|
||||
import pygame
|
||||
```
|
||||
|
||||
(8)
|
||||
```major1.py````
|
||||
```python
|
||||
import pygame
|
||||
```
|
||||
|
||||
(9)
|
||||
```major2.py````
|
||||
```python
|
||||
import pygame
|
||||
```
|
||||
|
||||
(10)
|
||||
```js
|
||||
// File: bruh.js
|
||||
const a = 1;
|
||||
```
|
||||
|
||||
(11)
|
||||
```swag.tsx
|
||||
// File: swag.tsx
|
||||
const a: number = 1;
|
||||
```
|
||||
|
||||
(12)
|
||||
```gmoita.ts
|
||||
// File: gmoita.tsx
|
||||
const a: number = 1;
|
||||
```
|
||||
|
||||
(13)
|
||||
** file1.py **
|
||||
```python
|
||||
import pygame
|
||||
```
|
||||
|
||||
(14)
|
||||
**file2.py**
|
||||
```python
|
||||
import pygame
|
||||
```
|
||||
|
||||
(15)
|
||||
#### `gm.py`
|
||||
```python
|
||||
import pygame
|
||||
```
|
||||
'''
|
||||
|
||||
class TestChatParsing(unittest.TestCase):
    """Checks that parse_chat pairs every sample block with a filename."""

    def setUp(self):
        self.chat = CODE_FORMATS
        # One expected filename per numbered sample in CODE_FORMATS,
        # listed in the order the samples appear.
        self._expected_filenames = (
            'main.py',
            'entry.py',
            'rickroll.py',
            'engineer.py',
            'adastra.py',
            'bird.py',
            'obstacle.py',
            'major1.py',
            'major2.py',
            'bruh.js',
            'swag.tsx',
            'gmoita.ts',
            'file1.py',
            'file2.py',
            'gm.py',
        )

    def test_parsing(self):
        parsed = parse_chat(self.chat)
        expected = self._expected_filenames

        # Lengths are compared first, so the pairwise walk below cannot
        # silently drop a trailing entry.
        self.assertEqual(len(parsed), len(expected))

        for (filename, content), wanted in zip(parsed, expected):
            self.assertEqual(filename, wanted)
            self.assertNotEqual(content, '')
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user