From c821b294c619c604099b7a9497cc967d9e65ca29 Mon Sep 17 00:00:00 2001 From: ido777 Date: Sat, 15 Jul 2023 01:23:59 +0300 Subject: [PATCH] Fix orjson encoding text with UTF-8 surrogates (#3666) * added lib ftfy (fixes text for you), to solve surrogates errors --------- Co-authored-by: Reinier van der Leer --- autogpt/memory/vector/memory_item.py | 4 ++++ requirements.txt | 1 + 2 files changed, 5 insertions(+) diff --git a/autogpt/memory/vector/memory_item.py b/autogpt/memory/vector/memory_item.py index 587a915b..f7a7fe6e 100644 --- a/autogpt/memory/vector/memory_item.py +++ b/autogpt/memory/vector/memory_item.py @@ -4,6 +4,7 @@ import dataclasses import json from typing import Literal +import ftfy import numpy as np from autogpt.config import Config @@ -43,6 +44,9 @@ class MemoryItem: ): logger.debug(f"Memorizing text:\n{'-'*32}\n{text}\n{'-'*32}\n") + # Fix encoding, e.g. removing unicode surrogates (see issue #778) + text = ftfy.fix_text(text) + chunks = [ chunk for chunk, _ in ( diff --git a/requirements.txt b/requirements.txt index 47aa08a6..4af8bccd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ google-api-python-client #(https://developers.google.com/custom-search/v1/overvi pinecone-client==2.2.1 redis orjson==3.8.10 +ftfy>=6.1.1 Pillow selenium==4.1.4 webdriver-manager