From 3d8b478e53f1476273b06d8cc09e93ec7abf9db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 10:29:27 +0200 Subject: [PATCH 01/10] =?UTF-8?q?=E2=9C=85=20feat:=20requirements=20txt=20?= =?UTF-8?q?validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/apis/pypi.py | 65 ++++++++++++++++++++++++++++++- src/options/generate/generator.py | 3 +- test/unit/test_api.py | 31 ++++++++++++++- 3 files changed, 96 insertions(+), 3 deletions(-) diff --git a/src/apis/pypi.py b/src/apis/pypi.py index 7b0456f..4351e2a 100644 --- a/src/apis/pypi.py +++ b/src/apis/pypi.py @@ -1,4 +1,10 @@ +import os +import re +from datetime import datetime + import requests +from packaging import version + def is_package_on_pypi(package_name, version=None): optional_version = f"/{version}" if version else "" @@ -9,4 +15,61 @@ def is_package_on_pypi(package_name, version=None): elif response.status_code == 404: return False else: - return None \ No newline at end of file + return None + + +def get_latest_package_version(package_name): + url = f'https://pypi.org/pypi/{package_name}/json' + response = requests.get(url) + if response.status_code != 200: + return None + data = response.json() + releases = data['releases'] + + # Get package versions uploaded in 2021 or earlier + valid_versions = [] + for v, release_info in releases.items(): + upload_time = datetime.strptime(release_info[0]['upload_time'], '%Y-%m-%dT%H:%M:%S') + if upload_time.year <= 2021: + valid_versions.append(v) + + v = max(valid_versions, key=version.parse) if valid_versions else None + return v + + +def clean_requirements_txt(previous_microservice_path): + requirements_txt_path = os.path.join(previous_microservice_path, 'requirements.txt') + with open(requirements_txt_path, 'r', encoding='utf-8') as f: + requirements_txt = f.read() + + updated_requirements = [] + + for line in requirements_txt.split('\n'): + line = line.strip() + if not line or
line.startswith('#'): + continue + + split = re.split(r'==|>=|<=|>|<|~=', line) + if len(split) == 1: + version = None + package_name = split[0] + elif len(split) == 2: + package_name, version = split + else: + raise ValueError(f'Could not parse line {line} in requirements.txt') + + # Keep lines with jina, docarray, openai, pytest unchanged + if package_name in {'jina', 'docarray', 'openai', 'pytest'}: + updated_requirements.append(line) + continue + if is_package_on_pypi(package_name): + if version is None or not is_package_on_pypi(package_name, version): + latest_version = get_latest_package_version(package_name) + if latest_version is None: + raise ValueError(f'Package {package_name} not found on PyPI') + updated_requirements.append(f'{package_name}~={latest_version}') + else: + updated_requirements.append(line) + + with open(requirements_txt_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(updated_requirements)) diff --git a/src/options/generate/generator.py b/src/options/generate/generator.py index 7467d5e..c463db2 100644 --- a/src/options/generate/generator.py +++ b/src/options/generate/generator.py @@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass from src.apis import gpt from src.apis.gpt import _GPTConversation from src.apis.jina_cloud import process_error_message, push_executor, is_executor_in_hub -from src.apis.pypi import is_package_on_pypi +from src.apis.pypi import is_package_on_pypi, get_latest_package_version, clean_requirements_txt from src.constants import FILE_AND_TAG_PAIRS, NUM_IMPLEMENTATION_STRATEGIES, MAX_DEBUGGING_ITERATIONS, \ BLACKLISTED_PACKAGES, EXECUTOR_FILE_NAME, TEST_EXECUTOR_FILE_NAME, TEST_EXECUTOR_FILE_TAG, \ REQUIREMENTS_FILE_NAME, REQUIREMENTS_FILE_TAG, DOCKER_FILE_NAME, IMPLEMENTATION_FILE_NAME, \ @@ -303,6 +303,7 @@ pytest num_approach, i) next_microservice_path = get_microservice_path(self.microservice_root_path, microservice_name, packages, num_approach, i + 1) + 
clean_requirements_txt(previous_microservice_path) log_hubble = push_executor(previous_microservice_path) error = process_error_message(log_hubble) if error: diff --git a/test/unit/test_api.py b/test/unit/test_api.py index 5202a89..8ebb124 100644 --- a/test/unit/test_api.py +++ b/test/unit/test_api.py @@ -1,5 +1,7 @@ +import os + from src.apis.jina_cloud import is_executor_in_hub -from src.apis.pypi import is_package_on_pypi +from src.apis.pypi import is_package_on_pypi, clean_requirements_txt from src.options.generate.generator import Generator @@ -32,3 +34,30 @@ def test_filter_packages_list(): ["gpt_3_5_turbo", "requests", "pydub"], ["requests", "gtts"] ] + + +def test_precheck_requirements_txt(tmpdir): + requirements_content = """\ +jina==1.2.3 +docarray==1.2.3 +requests~=2.26.0 +gtts~=2.2.3 +pydub~=123.123.123 +base64~=3.3.0 +""" + requirements_clean = """\ +jina==1.2.3 +docarray==1.2.3 +requests~=2.26.0 +gtts~=2.2.3 +pydub~=0.25.1""" + requirements_txt_path = os.path.join(tmpdir, "requirements.txt") + with open(requirements_txt_path, "w", encoding="utf-8") as f: + f.write(requirements_content) + + clean_requirements_txt(tmpdir) + + with open(requirements_txt_path, "r", encoding="utf-8") as f: + updated_requirements = f.read() + + assert updated_requirements == requirements_clean From 132c01086cb2a3e7199f03c2b3974e7a839ef724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 10:56:21 +0200 Subject: [PATCH 02/10] =?UTF-8?q?=E2=9C=85=20feat:=20requirements=20txt=20?= =?UTF-8?q?validation=20update=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/apis/pypi.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/apis/pypi.py b/src/apis/pypi.py index 4351e2a..7c57a82 100644 --- a/src/apis/pypi.py +++ b/src/apis/pypi.py @@ -7,6 +7,9 @@ from packaging import version def is_package_on_pypi(package_name, version=None): + """ + Returns True if the 
package is on PyPI, False if it is not, and None if the status code is not 200 or 404. + """ optional_version = f"/{version}" if version else "" url = f"https://pypi.org/pypi/{package_name}{optional_version}/json" response = requests.get(url) @@ -19,6 +22,9 @@ def is_package_on_pypi(package_name, version=None): def get_latest_package_version(package_name): + """ + Returns the latest version of a package that was uploaded in 2021 or earlier. + """ url = f'https://pypi.org/pypi/{package_name}/json' response = requests.get(url) if response.status_code != 200: @@ -38,6 +44,11 @@ def get_latest_package_version(package_name): def clean_requirements_txt(previous_microservice_path): + """ + It can happen that the generated requirements.txt contains packages that are not on PyPI (like base64). + In this case, we remove the requirement from requirements.txt. + In case the package is on PyPI, but the version is not, we update the version to the latest version that was uploaded in 2021 or earlier. + """ requirements_txt_path = os.path.join(previous_microservice_path, 'requirements.txt') with open(requirements_txt_path, 'r', encoding='utf-8') as f: requirements_txt = f.read() From be50fc4d6dca2592d0d28da49ee91334d844c517 Mon Sep 17 00:00:00 2001 From: Jina Dev Bot Date: Wed, 3 May 2023 09:07:52 +0000 Subject: [PATCH 03/10] chore(version): the next version will be 0.18.35 build(joschkabraun): Fix langchain version --- src/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/__init__.py b/src/__init__.py index ac07088..07194e5 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,3 +1,3 @@ -__version__ = '0.18.34' +__version__ = '0.18.35' from src.cli import main \ No newline at end of file From d1d361797893f01aedd07969e6200d23b89f719e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 11:51:11 +0200 Subject: [PATCH 04/10] =?UTF-8?q?=E2=9C=85=20fix:=20take=20last=20matching?=
=?UTF-8?q?=20block=20and=20support=20pdf=20multi=20media?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options/generate/generator.py | 6 +++--- src/options/generate/static_files/microservice/Dockerfile | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/options/generate/generator.py b/src/options/generate/generator.py index c463db2..2710691 100644 --- a/src/options/generate/generator.py +++ b/src/options/generate/generator.py @@ -47,9 +47,9 @@ class Generator: def extract_content_from_result(self, plain_text, file_name, match_single_block=False, can_contain_code_block=True): optional_line_break = '\n' if can_contain_code_block else '' # the \n at the end makes sure that ``` within the generated code is not matched because it is not right before a line break pattern = fr"\*?\*?{file_name}\*?\*?\n```(?:\w+\n)?([\s\S]*?){optional_line_break}```" - match = re.search(pattern, plain_text, re.MULTILINE) - if match: - return match.group(1).strip() + matches = re.findall(pattern, plain_text, re.MULTILINE) + if matches: + return matches[-1].strip() elif match_single_block: # Check for a single code block single_code_block_pattern = r"```(?:\w+\n)?([\s\S]*?)```" diff --git a/src/options/generate/static_files/microservice/Dockerfile b/src/options/generate/static_files/microservice/Dockerfile index 29805eb..7822c80 100644 --- a/src/options/generate/static_files/microservice/Dockerfile +++ b/src/options/generate/static_files/microservice/Dockerfile @@ -2,7 +2,7 @@ FROM jinaai/jina:3.14.1-py39-standard # update pip RUN pip install --upgrade pip -RUN apt-get update && apt-get install --no-install-recommends -y ffmpeg build-essential pkg-config {{apt_get_packages}} && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install --no-install-recommends -y ffmpeg build-essential pkg-config poppler-utils {{apt_get_packages}} && apt-get clean && rm -rf /var/lib/apt/lists/* ## install 
requirements for the executor COPY requirements.txt . From dc6ae5b3cf4ffcd2214e3b5436081b13c31aa6b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 12:10:45 +0200 Subject: [PATCH 05/10] =?UTF-8?q?=E2=9C=85=20fix:=20first=20iteration=20do?= =?UTF-8?q?cker=20packages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options/generate/generator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/options/generate/generator.py b/src/options/generate/generator.py index 2710691..d293079 100644 --- a/src/options/generate/generator.py +++ b/src/options/generate/generator.py @@ -210,8 +210,10 @@ metas: with open(os.path.join(os.path.dirname(__file__), 'static_files', 'microservice', 'Dockerfile'), 'r', encoding='utf-8') as f: docker_file_template_lines = f.readlines() - docker_file_template_lines = [line for line in docker_file_template_lines if - not line.startswith('RUN apt-get update')] + docker_file_template_lines = [ + line.replace('{{apt_get_packages}}', '') + for line in docker_file_template_lines + ] docker_file_content = '\n'.join(docker_file_template_lines) persist_file(docker_file_content, os.path.join(MICROSERVICE_FOLDER_v1, 'Dockerfile')) From f754e871018ca29881d72ea63de5c12826532a46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 14:04:00 +0200 Subject: [PATCH 06/10] =?UTF-8?q?=E2=9C=85=20fix:=20approaches?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options/generate/templates_user.py | 35 +++++++++++--------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/options/generate/templates_user.py b/src/options/generate/templates_user.py index 13b6114..759b05a 100644 --- a/src/options/generate/templates_user.py +++ b/src/options/generate/templates_user.py @@ -44,25 +44,6 @@ PDFParserExecutor ```''' ) - 
-template_generate_possible_packages_output_format_string = '''You must output the package combinations as a \ -list of lists wrapped into ``` and name it **strategies.json**. \ -Do not use quotation marks around packages names in the output. \ -Separate packages in a combination by comma. \ -Note that you can also leave a line empty to indicate that one of the strategies does not require any package and can be done in plain python. -Write the output using double asterisks and triple backticks like this: -**strategies.json** -``` -[ - ["package1", "package2", "package3"], - ["package4", "package5"], - ["package6", "package7", "package8", "package9"], - [], - ["package10"] -] -```''' - - template_generate_possible_packages = PromptTemplate.from_template( '''Here is the task description of the problem you need to solve: "{description}" @@ -84,7 +65,19 @@ When answering, just write "yes" or "no". If the package is mentioned in the description, then it is automatically the best one. If you listed gpt_3_5_turbo earlier, you must use it. gpt_3_5_turbo is the best package for handling text-based tasks. Also, gpt_3_5_turbo doesn't need any other packages processing text or using language models. It can handle any text-based task alone. -''' + template_generate_possible_packages_output_format_string) +You must output the package combinations as json wrapped into tripple backticks ``` and name it **strategies.json**. \ +Note that you can also leave a list empty to indicate that one of the strategies does not require any package and can be done in plain python. +Write the output using double asterisks and triple backticks like this: +**strategies.json** +``` +[ + ["package1", "package2", "package3"], + ["package4", "package5"], + ["package6", "package7", "package8", "package9"], + [], + ["package10"] +] +```''') template_code_wrapping_string = '''The code will go into {file_name_purpose}. 
@@ -211,7 +204,7 @@ The output would be: template_summarize_error = PromptTemplate.from_template( '''Here is an error message I encountered during the docker build process: "{error}" -Your task is to summarize the error message as compact and informative as possible while maintaining all information necessary to debug the core issue. +Your task is to summarize the error message as compact and informative as possible while maintaining all information necessary to debug the core issue (100 words). Warnings are not worth mentioning.''' ) From b5931a2368e6580e9a2a2ff1c070865d88f5585f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 14:15:48 +0200 Subject: [PATCH 07/10] =?UTF-8?q?=E2=9C=85=20fix:=20gpt=203=205=20confuse?= =?UTF-8?q?=20the=20approaches?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options/generate/templates_user.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/options/generate/templates_user.py b/src/options/generate/templates_user.py index 759b05a..b9089f6 100644 --- a/src/options/generate/templates_user.py +++ b/src/options/generate/templates_user.py @@ -63,7 +63,6 @@ When answering, just write "yes" or "no". 6. Determine the 5 most suitable python package combinations, ordered from the best to the least suitable. Combine the packages to achieve a comprehensive solution. If the package is mentioned in the description, then it is automatically the best one. -If you listed gpt_3_5_turbo earlier, you must use it. gpt_3_5_turbo is the best package for handling text-based tasks. Also, gpt_3_5_turbo doesn't need any other packages processing text or using language models. It can handle any text-based task alone. You must output the package combinations as json wrapped into tripple backticks ``` and name it **strategies.json**. 
\ Note that you can also leave a list empty to indicate that one of the strategies does not require any package and can be done in plain python. @@ -122,7 +121,7 @@ Obey the following rules: Your approach: 1. Identify the core challenge when implementing the function. -2. Think about solutions for these challenges. If gpt_3_5_turbo is mentioned in the above list of packages, then you must use it. +2. Think about solutions for these challenges. 3. Decide for one of the solutions. 4. Write the code for the function. Don't write code for the test. ''' + gpt_35_turbo_usage_string + '\n' + template_code_wrapping_string From 7c1ed937c4eeab3ee2edb9fd3838e27aa54d55a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 14:45:33 +0200 Subject: [PATCH 08/10] =?UTF-8?q?=E2=9C=85=20fix:=20gpt=203=205=20confuse?= =?UTF-8?q?=20the=20approaches?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options/generate/templates_user.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/options/generate/templates_user.py b/src/options/generate/templates_user.py index b9089f6..4a2e1c3 100644 --- a/src/options/generate/templates_user.py +++ b/src/options/generate/templates_user.py @@ -44,14 +44,14 @@ PDFParserExecutor ```''' ) +# todo split into multiple calls. One for brainstorming - one for the final answer template_generate_possible_packages = PromptTemplate.from_template( '''Here is the task description of the problem you need to solve: "{description}" -1. Write down ut to 3 different strategies to solve the task. For each strategy write down all the non-trivial subtasks you need to solve. If there is a natural language understanding or generation stragegy, write it down. -2. Find out what is the core problem to solve. -3. 
List up to 10 Python packages that are specifically designed or have functionalities to solve the complete core problem with one of the defined strategies. You must add gpt_3_5_turbo if the task involves generating or understanding natural language or using a (pre-trained) language model. -4. Exclude any package that can generate or understand natural language or enables using any language model, but you must not exclude gpt_3_5_turbo. Print the cleaned list of packages and give a brief reason for keeping it after its name. -5. For each cleaned package think if it fulfills the following requirements: +1. Write down up to 3 different strategies to solve the task. For each strategy write down how it solves the core problems. +Note that packages are preferred over external apis except if it is mentioned in the description. +2. For each strategy list up to 3 Python packages that are specifically designed or have functionalities to solve the complete core problems. +3. For each package think if it fulfills the following requirements: a) specifically designed or have functionalities to solve the complete core problem. b) has a stable api among different versions c) does not have system requirements @@ -61,9 +61,7 @@ e) the implementation of the core problem using the package would obey the follo When answering, just write "yes" or "no". -6. Determine the 5 most suitable python package combinations, ordered from the best to the least suitable. Combine the packages to achieve a comprehensive solution. -If the package is mentioned in the description, then it is automatically the best one. - +4. For each approach, list the required python package combinations as discibed in the following. You must output the package combinations as json wrapped into tripple backticks ``` and name it **strategies.json**. \ Note that you can also leave a list empty to indicate that one of the strategies does not require any package and can be done in plain python. 
Write the output using double asterisks and triple backticks like this: From c76d4c3e03a1ac453a837145eb61cb233f4003e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 14:48:33 +0200 Subject: [PATCH 09/10] =?UTF-8?q?=E2=9C=85=20fix:=20docker=20dependency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options/generate/static_files/microservice/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options/generate/static_files/microservice/Dockerfile b/src/options/generate/static_files/microservice/Dockerfile index 7822c80..681ad0c 100644 --- a/src/options/generate/static_files/microservice/Dockerfile +++ b/src/options/generate/static_files/microservice/Dockerfile @@ -2,7 +2,7 @@ FROM jinaai/jina:3.14.1-py39-standard # update pip RUN pip install --upgrade pip -RUN apt-get update && apt-get install --no-install-recommends -y ffmpeg build-essential pkg-config poppler-utils {{apt_get_packages}} && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install --no-install-recommends -y ffmpeg build-essential pkg-config libpoppler-cpp-dev {{apt_get_packages}} && apt-get clean && rm -rf /var/lib/apt/lists/* ## install requirements for the executor COPY requirements.txt . 
From c0487dbb1ab8b95daa96756efeb95b81ac79d263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Ho=CC=88nicke?= Date: Wed, 3 May 2023 15:12:23 +0200 Subject: [PATCH 10/10] =?UTF-8?q?=E2=9C=85=20fix:=20docker=20dependency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/options/generate/templates_user.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/options/generate/templates_user.py b/src/options/generate/templates_user.py index 4a2e1c3..12f34e7 100644 --- a/src/options/generate/templates_user.py +++ b/src/options/generate/templates_user.py @@ -201,7 +201,9 @@ The output would be: template_summarize_error = PromptTemplate.from_template( '''Here is an error message I encountered during the docker build process: "{error}" -Your task is to summarize the error message as compact and informative as possible while maintaining all information necessary to debug the core issue (100 words). +Your task is to summarize the error message as compact and informative as possible \ +while maintaining all information necessary to debug the core issue (100 words). +Note that you must not suggest a solution to the error. Warnings are not worth mentioning.''' )