Fix the maximum context length issue by chunking (#3222)

Co-authored-by: Reinier van der Leer <github@pwuts.nl>
Author: kinance
Date: 2023-05-02 03:13:24 +09:00 (committed by GitHub)
parent 0ef6f06462
commit 4767fe63d3
9 changed files with 1801 additions and 43 deletions


@@ -2,9 +2,12 @@ from __future__ import annotations

 import functools
 import time
+from itertools import islice
 from typing import List, Optional

+import numpy as np
 import openai
+import tiktoken
 from colorama import Fore, Style
 from openai.error import APIError, RateLimitError, Timeout
@@ -207,6 +210,23 @@ def create_chat_completion(
     return resp


+def batched(iterable, n):
+    """Batch data into tuples of length n. The last batch may be shorter."""
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
+    it = iter(iterable)
+    while batch := tuple(islice(it, n)):
+        yield batch
+
+
+def chunked_tokens(text, tokenizer_name, chunk_length):
+    tokenizer = tiktoken.get_encoding(tokenizer_name)
+    tokens = tokenizer.encode(text)
+    chunks_iterator = batched(tokens, chunk_length)
+    yield from chunks_iterator
+
+
 def get_ada_embedding(text: str) -> List[float]:
     """Get an embedding from the ada model.
@@ -217,7 +237,7 @@ def get_ada_embedding(text: str) -> List[float]:
         List[float]: The embedding.
     """
     cfg = Config()
-    model = "text-embedding-ada-002"
+    model = cfg.embedding_model
     text = text.replace("\n", " ")

     if cfg.use_azure:
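
The hardcoded model name is replaced by three config-driven values: cfg.embedding_model, cfg.embedding_tokenizer, and cfg.embedding_token_limit. Their defaults live in the project's Config class rather than in this diff; plausible values, shown here only as an assumption, would be:

    # Hypothetical defaults for the new config fields (not part of this diff):
    EMBEDDING_MODEL = "text-embedding-ada-002"  # cfg.embedding_model
    EMBEDDING_TOKENIZER = "cl100k_base"         # cfg.embedding_tokenizer
    EMBEDDING_TOKEN_LIMIT = 8191                # cfg.embedding_token_limit, ada-002's input cap
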
@@ -226,13 +246,7 @@ def get_ada_embedding(text: str) -> List[float]:
         kwargs = {"model": model}

     embedding = create_embedding(text, **kwargs)
-    api_manager = ApiManager()
-    api_manager.update_cost(
-        prompt_tokens=embedding.usage.prompt_tokens,
-        completion_tokens=0,
-        model=model,
-    )
-    return embedding["data"][0]["embedding"]
+    return embedding


 @retry_openai_api()
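
With the cost bookkeeping moved into create_embedding (next hunk), get_ada_embedding reduces to a thin wrapper. Roughly, with the Azure branch omitted:

    # Simplified shape of get_ada_embedding after this change (not verbatim):
    def get_ada_embedding(text: str) -> List[float]:
        cfg = Config()
        text = text.replace("\n", " ")
        return create_embedding(text, model=cfg.embedding_model)
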
@@ -251,8 +265,31 @@ def create_embedding(
         openai.Embedding: The embedding object.
     """
     cfg = Config()
-    return openai.Embedding.create(
-        input=[text],
-        api_key=cfg.openai_api_key,
-        **kwargs,
-    )
+    chunk_embeddings = []
+    chunk_lengths = []
+    for chunk in chunked_tokens(
+        text,
+        tokenizer_name=cfg.embedding_tokenizer,
+        chunk_length=cfg.embedding_token_limit,
+    ):
+        embedding = openai.Embedding.create(
+            input=[chunk],
+            api_key=cfg.openai_api_key,
+            **kwargs,
+        )
+        api_manager = ApiManager()
+        api_manager.update_cost(
+            prompt_tokens=embedding.usage.prompt_tokens,
+            completion_tokens=0,
+            model=cfg.embedding_model,
+        )
+        chunk_embeddings.append(embedding["data"][0]["embedding"])
+        chunk_lengths.append(len(chunk))
+
+    # do weighted avg
+    chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lengths)
+    chunk_embeddings = chunk_embeddings / np.linalg.norm(
+        chunk_embeddings
+    )  # normalize the length to one
+    chunk_embeddings = chunk_embeddings.tolist()
+    return chunk_embeddings
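
The per-chunk vectors are combined by a token-count-weighted average and then rescaled to unit length, so the result can stand in for a single embedding in cosine-similarity lookups (this mirrors OpenAI's cookbook recipe for embedding inputs longer than the model's context window). A self-contained numpy sketch of that math, using toy 3-dimensional vectors rather than real embeddings:

    import numpy as np

    # Toy stand-ins for two chunk embeddings and their token counts.
    chunk_embeddings = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]
    chunk_lengths = [8191, 100]  # the first chunk dominates the average

    avg = np.average(chunk_embeddings, axis=0, weights=chunk_lengths)
    unit = avg / np.linalg.norm(avg)  # normalize the length to one

    print(unit)                  # ~[0.99997, 0.01221, 0.0]
    print(np.linalg.norm(unit))  # 1.0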