fix: take last matching block and support PDF multimedia

This commit is contained in:
Florian Hönicke
2023-05-03 11:51:11 +02:00
parent 132c01086c
commit d1d3617978
2 changed files with 4 additions and 4 deletions

View File

@@ -47,9 +47,9 @@ class Generator:
def extract_content_from_result(self, plain_text, file_name, match_single_block=False, can_contain_code_block=True):
optional_line_break = '\n' if can_contain_code_block else '' # the \n at the end makes sure that ``` within the generated code is not matched because it is not right before a line break
pattern = fr"\*?\*?{file_name}\*?\*?\n```(?:\w+\n)?([\s\S]*?){optional_line_break}```"
match = re.search(pattern, plain_text, re.MULTILINE)
if match:
return match.group(1).strip()
matches = re.findall(pattern, plain_text, re.MULTILINE)
if matches:
return matches[-1].strip()
elif match_single_block:
# Check for a single code block
single_code_block_pattern = r"```(?:\w+\n)?([\s\S]*?)```"

View File

@@ -2,7 +2,7 @@ FROM jinaai/jina:3.14.1-py39-standard
# update pip
RUN pip install --upgrade pip
RUN apt-get update && apt-get install --no-install-recommends -y ffmpeg build-essential pkg-config {{apt_get_packages}} && apt-get clean && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install --no-install-recommends -y ffmpeg build-essential pkg-config poppler-utils {{apt_get_packages}} && apt-get clean && rm -rf /var/lib/apt/lists/*
## install requirements for the executor
COPY requirements.txt .