mirror of
https://github.com/aljazceru/dev-gpt.git
synced 2026-01-04 22:34:21 +01:00
Merge branch 'main' of https://github.com/jina-ai/gptdeploy into feat-avoid-loop
# Conflicts: # src/options/generate/templates_user.py
This commit is contained in:
@@ -1,3 +1,3 @@
|
||||
__version__ = '0.18.34'
|
||||
__version__ = '0.18.35'
|
||||
|
||||
from src.cli import main
|
||||
@@ -1,6 +1,15 @@
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from packaging import version
|
||||
|
||||
|
||||
def is_package_on_pypi(package_name, version=None):
|
||||
"""
|
||||
Returns True if the package is on PyPI, False if it is not, and None if the status code is not 200 or 404.
|
||||
"""
|
||||
optional_version = f"/{version}" if version else ""
|
||||
url = f"https://pypi.org/pypi/{package_name}{optional_version}/json"
|
||||
response = requests.get(url)
|
||||
@@ -9,4 +18,69 @@ def is_package_on_pypi(package_name, version=None):
|
||||
elif response.status_code == 404:
|
||||
return False
|
||||
else:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def get_latest_package_version(package_name):
    """
    Return the latest version string of ``package_name`` on PyPI whose first
    upload happened in 2021 or earlier.

    Returns ``None`` if the package does not exist, the PyPI response is
    malformed, or no release satisfies the date constraint.
    """
    url = f'https://pypi.org/pypi/{package_name}/json'
    response = requests.get(url)
    if response.status_code != 200:
        return None
    data = response.json()
    # Be defensive: a malformed JSON payload without 'releases' should not crash.
    releases = data.get('releases', {})

    # Get package versions uploaded no later than 2021
    valid_versions = []
    for v, release_info in releases.items():
        # Some releases have no uploaded files; release_info[0] would raise
        # IndexError for those, so skip them.
        if not release_info:
            continue
        upload_time = datetime.strptime(release_info[0]['upload_time'], '%Y-%m-%dT%H:%M:%S')
        if upload_time.year <= 2021:
            valid_versions.append(v)

    # Modern `packaging` raises InvalidVersion for legacy (non-PEP-440)
    # version strings; drop those instead of crashing the whole cleanup.
    parseable_versions = []
    for v in valid_versions:
        try:
            version.parse(v)
        except Exception:
            continue
        parseable_versions.append(v)

    return max(parseable_versions, key=version.parse) if parseable_versions else None
|
||||
|
||||
|
||||
def clean_requirements_txt(previous_microservice_path):
    """
    Sanitize the generated requirements.txt in ``previous_microservice_path``.

    It can happen that the generated requirements.txt contains packages that
    are not on PyPI (like base64); such lines are dropped. In case the package
    is on PyPI but the pinned version is not, the pin is replaced with the
    latest version that is still not newer than 2021.

    Lines for jina, docarray, openai and pytest are kept unchanged.

    Raises:
        ValueError: if a requirement line cannot be parsed, or if no usable
            version can be determined for a package that is on PyPI.
    """
    requirements_txt_path = os.path.join(previous_microservice_path, 'requirements.txt')
    with open(requirements_txt_path, 'r', encoding='utf-8') as f:
        requirements_txt = f.read()

    updated_requirements = []

    for line in requirements_txt.split('\n'):
        line = line.strip()
        # Skip blank lines and comments.
        if not line or line.startswith('#'):
            continue

        # Split "<package><specifier><version>". The version variable is named
        # `pinned_version` to avoid shadowing `from packaging import version`.
        parts = re.split(r'==|>=|<=|>|<|~=', line)
        if len(parts) == 1:
            pinned_version = None
            package_name = parts[0]
        elif len(parts) == 2:
            package_name, pinned_version = parts
        else:
            raise ValueError(f'Could not parse line {line} in requirements.txt')

        # Keep lines with jina, docarray, openai, pytest unchanged
        if package_name in {'jina', 'docarray', 'openai', 'pytest'}:
            updated_requirements.append(line)
            continue
        if is_package_on_pypi(package_name):
            if pinned_version is None or not is_package_on_pypi(package_name, pinned_version):
                latest_version = get_latest_package_version(package_name)
                if latest_version is None:
                    raise ValueError(f'Package {package_name} not found on PyPI')
                updated_requirements.append(f'{package_name}~={latest_version}')
            else:
                updated_requirements.append(line)
        # Packages not on PyPI at all (e.g. base64) are intentionally dropped.

    with open(requirements_txt_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(updated_requirements))
|
||||
|
||||
@@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass
|
||||
from src.apis import gpt
|
||||
from src.apis.gpt import _GPTConversation
|
||||
from src.apis.jina_cloud import process_error_message, push_executor, is_executor_in_hub
|
||||
from src.apis.pypi import is_package_on_pypi
|
||||
from src.apis.pypi import is_package_on_pypi, get_latest_package_version, clean_requirements_txt
|
||||
from src.constants import FILE_AND_TAG_PAIRS, NUM_IMPLEMENTATION_STRATEGIES, MAX_DEBUGGING_ITERATIONS, \
|
||||
BLACKLISTED_PACKAGES, EXECUTOR_FILE_NAME, TEST_EXECUTOR_FILE_NAME, TEST_EXECUTOR_FILE_TAG, \
|
||||
REQUIREMENTS_FILE_NAME, REQUIREMENTS_FILE_TAG, DOCKER_FILE_NAME, IMPLEMENTATION_FILE_NAME, \
|
||||
@@ -49,9 +49,9 @@ class Generator:
|
||||
def extract_content_from_result(self, plain_text, file_name, match_single_block=False, can_contain_code_block=True):
|
||||
optional_line_break = '\n' if can_contain_code_block else '' # the \n at the end makes sure that ``` within the generated code is not matched because it is not right before a line break
|
||||
pattern = fr"\*?\*?{file_name}\*?\*?\n```(?:\w+\n)?([\s\S]*?){optional_line_break}```"
|
||||
match = re.search(pattern, plain_text, re.MULTILINE)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
matches = re.findall(pattern, plain_text, re.MULTILINE)
|
||||
if matches:
|
||||
return matches[-1].strip()
|
||||
elif match_single_block:
|
||||
# Check for a single code block
|
||||
single_code_block_pattern = r"```(?:\w+\n)?([\s\S]*?)```"
|
||||
@@ -212,8 +212,10 @@ metas:
|
||||
with open(os.path.join(os.path.dirname(__file__), 'static_files', 'microservice', 'Dockerfile'), 'r',
|
||||
encoding='utf-8') as f:
|
||||
docker_file_template_lines = f.readlines()
|
||||
docker_file_template_lines = [line for line in docker_file_template_lines if
|
||||
not line.startswith('RUN apt-get update')]
|
||||
docker_file_template_lines = [
|
||||
line.replace('{{apt_get_packages}}', '')
|
||||
for line in docker_file_template_lines
|
||||
]
|
||||
docker_file_content = '\n'.join(docker_file_template_lines)
|
||||
persist_file(docker_file_content, os.path.join(MICROSERVICE_FOLDER_v1, 'Dockerfile'))
|
||||
|
||||
@@ -305,6 +307,7 @@ pytest
|
||||
num_approach, i)
|
||||
next_microservice_path = get_microservice_path(self.microservice_root_path, microservice_name, packages,
|
||||
num_approach, i + 1)
|
||||
clean_requirements_txt(previous_microservice_path)
|
||||
log_hubble = push_executor(previous_microservice_path)
|
||||
error = process_error_message(log_hubble)
|
||||
if error:
|
||||
|
||||
@@ -2,7 +2,7 @@ FROM jinaai/jina:3.15.1-dev14-py39-standard
|
||||
# update pip
|
||||
RUN pip install --upgrade pip
|
||||
|
||||
RUN apt-get update && apt-get install --no-install-recommends -y ffmpeg build-essential pkg-config {{apt_get_packages}} && apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
RUN apt-get update && apt-get install --no-install-recommends -y ffmpeg build-essential pkg-config libpoppler-cpp-dev {{apt_get_packages}} && apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
## install requirements for the executor
|
||||
COPY requirements.txt .
|
||||
|
||||
@@ -44,12 +44,26 @@ PDFParserExecutor
|
||||
```'''
|
||||
)
|
||||
|
||||
# todo split into multiple calls. One for brainstorming - one for the final answer
|
||||
template_generate_possible_packages = PromptTemplate.from_template(
|
||||
'''Here is the task description of the problem you need to solve:
|
||||
"{description}"
|
||||
1. Write down up to 3 different strategies to solve the task. For each strategy write down how it solves the core problems.
|
||||
Note that packages are preferred over external apis except if it is mentioned in the description.
|
||||
2. For each strategy list up to 3 Python packages that are specifically designed or have functionalities to solve the complete core problems.
|
||||
3. For each package think if it fulfills the following requirements:
|
||||
a) specifically designed or have functionalities to solve the complete core problem.
|
||||
b) has a stable api among different versions
|
||||
c) does not have system requirements
|
||||
d) can solve the task when running in a docker container
|
||||
e) the implementation of the core problem using the package would obey the following rules:
|
||||
''' + not_allowed_function_string + '''
|
||||
|
||||
template_generate_possible_packages_output_format_string = '''You must output the package combinations as a \
|
||||
list of lists wrapped into ``` and name it **strategies.json**. \
|
||||
Do not use quotation marks around packages names in the output. \
|
||||
Separate packages in a combination by comma. \
|
||||
Note that you can also leave a line empty to indicate that one of the strategies does not require any package and can be done in plain python.
|
||||
When answering, just write "yes" or "no".
|
||||
|
||||
4. For each approach, list the required python package combinations as discibed in the following.
|
||||
You must output the package combinations as json wrapped into tripple backticks ``` and name it **strategies.json**. \
|
||||
Note that you can also leave a list empty to indicate that one of the strategies does not require any package and can be done in plain python.
|
||||
Write the output using double asterisks and triple backticks like this:
|
||||
**strategies.json**
|
||||
```
|
||||
@@ -60,31 +74,7 @@ Write the output using double asterisks and triple backticks like this:
|
||||
[],
|
||||
["package10"]
|
||||
]
|
||||
```'''
|
||||
|
||||
|
||||
template_generate_possible_packages = PromptTemplate.from_template(
|
||||
'''Here is the task description of the problem you need to solve:
|
||||
"{description}"
|
||||
1. Write down ut to 3 different strategies to solve the task. For each strategy write down all the non-trivial subtasks you need to solve. If there is a natural language understanding or generation stragegy, write it down.
|
||||
2. Find out what is the core problem to solve.
|
||||
3. List up to 10 Python packages that are specifically designed or have functionalities to solve the complete core problem with one of the defined strategies. You must add gpt_3_5_turbo if the task involves generating or understanding natural language or using a (pre-trained) language model.
|
||||
4. Exclude any package that can generate or understand natural language or enables using any language model, but you must not exclude gpt_3_5_turbo. Print the cleaned list of packages and give a brief reason for keeping it after its name.
|
||||
5. For each cleaned package think if it fulfills the following requirements:
|
||||
a) specifically designed or have functionalities to solve the complete core problem.
|
||||
b) has a stable api among different versions
|
||||
c) does not have system requirements
|
||||
d) can solve the task when running in a docker container
|
||||
e) the implementation of the core problem using the package would obey the following rules:
|
||||
''' + not_allowed_function_string + '''
|
||||
|
||||
When answering, just write "yes" or "no".
|
||||
|
||||
6. Determine the 5 most suitable python package combinations, ordered from the best to the least suitable. Combine the packages to achieve a comprehensive solution.
|
||||
If the package is mentioned in the description, then it is automatically the best one.
|
||||
If you listed gpt_3_5_turbo earlier, you must use it. gpt_3_5_turbo is the best package for handling text-based tasks. Also, gpt_3_5_turbo doesn't need any other packages processing text or using language models. It can handle any text-based task alone.
|
||||
|
||||
''' + template_generate_possible_packages_output_format_string)
|
||||
```''')
|
||||
|
||||
|
||||
template_code_wrapping_string = '''The code will go into {file_name_purpose}.
|
||||
@@ -129,7 +119,7 @@ Obey the following rules:
|
||||
|
||||
Your approach:
|
||||
1. Identify the core challenge when implementing the function.
|
||||
2. Think about solutions for these challenges. If gpt_3_5_turbo is mentioned in the above list of packages, then you must use it.
|
||||
2. Think about solutions for these challenges.
|
||||
3. Decide for one of the solutions.
|
||||
4. Write the code for the function. Don't write code for the test.
|
||||
''' + gpt_35_turbo_usage_string + '\n' + template_code_wrapping_string
|
||||
@@ -211,6 +201,10 @@ The output would be:
|
||||
template_summarize_error = PromptTemplate.from_template(
|
||||
'''Your task is to condense an error encountered during the docker build process. The error message is as follows:
|
||||
"{error}"
|
||||
Your task is to summarize the error message as compact and informative as possible \
|
||||
while maintaining all information necessary to debug the core issue (100 words).
|
||||
Note that you must not suggest a solution to the error.
|
||||
Warnings are not worth mentioning.
|
||||
Your response should be concise and informative, highlighting the core issue while omitting any warnings. It should also provide some additional context regarding the specific file and line number where the error occurred. The actual core error message should also be included.'''
|
||||
)
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import os
|
||||
|
||||
from src.apis.jina_cloud import is_executor_in_hub
|
||||
from src.apis.pypi import is_package_on_pypi
|
||||
from src.apis.pypi import is_package_on_pypi, clean_requirements_txt
|
||||
from src.options.generate.generator import Generator
|
||||
|
||||
|
||||
@@ -32,3 +34,30 @@ def test_filter_packages_list():
|
||||
["gpt_3_5_turbo", "requests", "pydub"],
|
||||
["requests", "gtts"]
|
||||
]
|
||||
|
||||
|
||||
def test_precheck_requirements_txt(tmpdir):
    """clean_requirements_txt drops unknown packages and fixes bad pins."""
    original_content = """\
jina==1.2.3
docarray==1.2.3
requests~=2.26.0
gtts~=2.2.3
pydub~=123.123.123
base64~=3.3.0
"""
    expected_content = """\
jina==1.2.3
docarray==1.2.3
requests~=2.26.0
gtts~=2.2.3
pydub~=0.25.1"""

    req_path = os.path.join(tmpdir, "requirements.txt")
    with open(req_path, "w", encoding="utf-8") as fh:
        fh.write(original_content)

    clean_requirements_txt(tmpdir)

    with open(req_path, "r", encoding="utf-8") as fh:
        assert fh.read() == expected_content
|
||||
|
||||
Reference in New Issue
Block a user