Mirror of https://github.com/aljazceru/IngestRSS.git (synced 2025-12-18 06:24:21 +01:00)

Commit: Update
delete.py (Normal file, 33 lines added)
@@ -0,0 +1,33 @@
+import requests
+import openai
+
+def mock_bank_api_call(data):
+    # Mock response from the bank API
+    bank_response = {
+        "status": "success",
+        "account_balance": 1000,
+        "currency": "USD"
+    }
+    return bank_response
+
+def process_data_with_openai(data):
+    # Call the mock bank API
+    bank_data = mock_bank_api_call(data)
+
+    # Prepare the prompt for OpenAI API
+    prompt = f"Bank API returned the following data: {bank_data}. Process this data."
+
+    # Call the OpenAI API
+    openai.api_key = 'your-openai-api-key'
+    response = openai.Completion.create(
+        engine="davinci",
+        prompt=prompt,
+        max_tokens=50
+    )
+
+    return response.choices[0].text.strip()
+
+# Example usage
+data = {"account_id": "12345"}
+result = process_data_with_openai(data)
+print(result)
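The file above targets the pre-1.0 `openai` SDK (`openai.Completion.create` with the `davinci` engine), which current releases of the library no longer expose. A rough equivalent against the 1.x client, offered as a sketch rather than part of the commit (the model name and reading the key from `OPENAI_API_KEY` are assumptions):

```python
import os
from openai import OpenAI  # 1.x-style client


def mock_bank_api_call(data: dict) -> dict:
    # Same hard-coded response as the mock above
    return {"status": "success", "account_balance": 1000, "currency": "USD"}


def process_data_with_openai(data: dict) -> str:
    bank_data = mock_bank_api_call(data)
    prompt = f"Bank API returned the following data: {bank_data}. Process this data."

    # The 1.x SDK can also pick OPENAI_API_KEY up from the environment on its own.
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed model; any chat-capable model works here
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,
    )
    return response.choices[0].message.content.strip()


print(process_data_with_openai({"account_id": "12345"}))
```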
@@ -365,6 +365,27 @@
 "df.sample(20)[\"content\"].values"
 ]
 },
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "def pinecone_save_article(article:dict):\n",
+  "    logger.info(\"Saving article to Pinecone\")\n",
+  "    index = get_index()\n",
+  "\n",
+  "    # Expected Keys from Pinecone *MUST* include 'id' and 'values'\n",
+  "    article[\"id\"] = article[\"article_id\"]\n",
+  "    article[\"values\"] = vectorize(article[\"content\"])\n",
+  "    \n",
+  "    namespace = f\"IngestRSS-Articles\"\n",
+  "    \n",
+  "    logger.info(\"Upserting article to Pinecone\")\n",
+  "    upsert_vectors(index, [article], namespace)\n",
+  "    logger.info(f\"Successfully upserted article w/ article-id: {article[\"article_id\"]} to Pinecone index {index.name} with namespace {namespace}\")"
+ ]
+},
 {
  "cell_type": "code",
  "execution_count": null,
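The new notebook cell mirrors `pinecone_save_article` from the Lambda code: it copies `article_id` into `id` and attaches the embedding as `values` before upserting. A minimal sketch of the record shape Pinecone's upsert expects, assuming the v3+ `pinecone` client and a 3072-dimensional index (the placeholder values and the metadata field are illustrative, not from the commit):

```python
import os
from pinecone import Pinecone  # assumes the v3+ pinecone client

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(os.environ["PINECONE_DB_NAME"])

record = {
    "id": "article-12345",             # mapped from article["article_id"]
    "values": [0.0] * 3072,            # placeholder; real code uses vectorize(article["content"])
    "metadata": {"title": "Example"},  # optional extra fields belong under metadata
}
index.upsert(vectors=[record], namespace="IngestRSS-Articles")
```

The notebook cell passes the whole article dict through as the record; the documented upsert shape keeps anything beyond `id` and `values` under `metadata`.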
rss_feeds.json (11095 changes)
File diff suppressed because it is too large
@@ -82,7 +82,7 @@ def get_or_create_kms_key():
     for key in response['Keys']:
         try:
             tags = kms_client.list_resource_tags(KeyId=key['KeyId'])['Tags']
-            if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # TODO: This is inefficient and should be fixed and more readable.
+            if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # FIXME: This is inefficient and should be fixed and more readable.
                 print(f"Found existing KMS key with ID: {key['KeyId']}")
                 return key['KeyId']
         except ClientError:
@@ -128,8 +128,6 @@ def create_function(lambda_client, function_name, runtime, role, handler, zip_fi
         raise

 def get_pillow_layer_arn():
-
-
     url = f"https://api.klayers.cloud/api/v2/p{os.getenv('PYTHON_VERSION')}/layers/latest/{os.getenv('AWS_REGION')}/json"
     try:
         response = requests.get(url)
@@ -29,20 +29,19 @@ def get_index():
 def vectorize(article:str) -> list[float]:
     response = client.embeddings.create(
         input=article,
-        model="text-embedding-3-large"
+        model=os.getenv('OPENAI_EMBEDDING_MODEL')
     )

     return response.data[0].embedding


-def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
+def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted.
     index.upsert(
         vectors=vectors,
         namespace=namespace
     )

-def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
+def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ]: Make sure this is working.

     if len(vector) != int(embedding_dim):
         raise ValueError("Length of vector does not match the embedding dimension")
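This hunk drops the hard-coded embedding model in favour of an `OPENAI_EMBEDDING_MODEL` environment variable. A minimal sketch of how `vectorize` and the dimension check in `query_vectors` line up under that convention (the fallback defaults and the client construction here are assumptions, not shown in the diff):

```python
import os
from openai import OpenAI  # assumes the 1.x openai SDK behind client.embeddings.create

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Keep the model and the index dimension in sync through the environment.
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-large")  # assumed fallback
embedding_dim = int(os.getenv("VECTOR_EMBEDDING_DIM", "3072"))                   # assumed fallback


def vectorize(article: str) -> list[float]:
    response = client.embeddings.create(input=article, model=embedding_model)
    vector = response.data[0].embedding
    if len(vector) != embedding_dim:
        raise ValueError("Length of vector does not match the embedding dimension")
    return vector
```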
@@ -4,8 +4,7 @@ import os
 import logging
 from random import randint
 from datetime import datetime
-# TODO: Move this article storage logic to a separate module inside of lambda.
-# TODO: Get better at handling loading local moduels insdie of the lambdda.
 from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize

 logger = logging.getLogger()

@@ -35,7 +34,8 @@ def pinecone_save_article(article:dict):

     article["id"] = article["article_id"]
     article["values"] = vectorize(article["content"])

-    namespace = f"IngestRSS-Articles"
+    namespace = os.getenv('PINECONE_NAMESPACE')

     logger.info("Upserting article to Pinecone")
     upsert_vectors(index, [article], namespace)
@@ -10,17 +10,67 @@ def update_env_vars(function_name):
     lambda_client = boto3.client('lambda')

     env_vars = {
-        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
+        # AWS Configuration
+        'AWS_REGION': os.environ.get('AWS_REGION'),
+        'AWS_ACCOUNT_ID': os.environ.get('AWS_ACCOUNT_ID'),
+        'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),
+        'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),
+        'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION'),
+
+        # Lambda Configuration
+        'LAMBDA_FUNCTION_NAME': os.environ.get('LAMBDA_FUNCTION_NAME'),
+        'STACK_BASE': os.environ.get('STACK_BASE'),
+        'LAMBDA_EXECUTION_ROLE_NAME': os.environ.get('LAMBDA_EXECUTION_ROLE_NAME'),
+        'LAMBDA_ROLE_ARN': os.environ.get('LAMBDA_ROLE_ARN'),
+        'LAMBDA_LAYER_VERSION': os.environ.get('LAMBDA_LAYER_VERSION'),
+        'LAMBDA_LAYER_NAME': os.environ.get('LAMBDA_LAYER_NAME'),
+        'LAMBDA_LAYER_ARN': os.environ.get('LAMBDA_LAYER_ARN'),
+        'LAMBDA_RUNTIME': os.environ.get('LAMBDA_RUNTIME'),
+        'LAMBDA_TIMEOUT': os.environ.get('LAMBDA_TIMEOUT', '300'), # Reasonable default timeout
+        'LAMBDA_MEMORY': os.environ.get('LAMBDA_MEMORY', '512'), # Reasonable default memory
+
+        # S3 Configuration
         'S3_BUCKET_NAME': os.environ.get('S3_BUCKET_NAME'),
+        'S3_LAMBDA_ZIPPED_BUCKET_NAME': os.environ.get('S3_LAMBDA_ZIPPED_BUCKET_NAME'),
+        'S3_LAYER_BUCKET_NAME': os.environ.get('S3_LAYER_BUCKET_NAME'),
+        'S3_LAYER_KEY_NAME': os.environ.get('S3_LAYER_KEY_NAME'),
+
+        # DynamoDB Configuration
         'DYNAMODB_TABLE_NAME': os.environ.get('DYNAMODB_TABLE_NAME'),
-        'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'),
-        'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY'),
+        'DYNAMODB_TABLE_ARN': os.environ.get('DYNAMODB_TABLE_ARN'),
+
+        # SQS Configuration
+        'SQS_QUEUE_NAME': os.environ.get('SQS_QUEUE_NAME'),
+        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
+        'SQS_QUEUE_ARN': os.environ.get('SQS_QUEUE_ARN'),
+
+        # Queue Filler Lambda Configuration
+        'QUEUE_FILLER_LAMBDA_NAME': os.environ.get('QUEUE_FILLER_LAMBDA_NAME'),
+        'QUEUE_FILLER_LAMBDA_S3_KEY': os.environ.get('QUEUE_FILLER_LAMBDA_S3_KEY'),
+
+        # Python Configuration
+        'PYTHON_VERSION': os.environ.get('PYTHON_VERSION', '3.12'), # Default Python version
+
+        # Application Settings
+        'APP_NAME': os.environ.get('APP_NAME', 'RSS Feed Processor'), # Default app name is fine
+        'VERSION': os.environ.get('VERSION', '1.0.0'), # Default version is fine
+        'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'), # Default to INFO logging
+
+        # Storage Configuration
+        'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY', 's3'), # Default to s3 storage
+
+        # Pinecone Configuration (only used if STORAGE_STRATEGY is 'pinecone')
         'PINECONE_API_KEY': os.environ.get('PINECONE_API_KEY'),
+        'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
         'PINECONE_SHARDS': os.environ.get('PINECONE_SHARDS'),
+        'PINECONE_NAMESPACE': os.environ.get('PINECONE_NAMESPACE'),
+
+        # Vector Configuration
         'VECTOR_EMBEDDING_MODEL': os.environ.get('VECTOR_EMBEDDING_MODEL'),
         'VECTOR_EMBEDDING_DIM': os.environ.get('VECTOR_EMBEDDING_DIM'),
         'VECTOR_SEARCH_METRIC': os.environ.get('VECTOR_SEARCH_METRIC'),
-        'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
+
+        # OpenAI Configuration
         'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY')
     }

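A dict like this is typically pushed to the function with boto3's `update_function_configuration`; a minimal sketch follows, with the None-filtering and reserved-key handling added here as assumptions rather than something shown in the diff. Note that Lambda reserves several of the AWS_* keys above (AWS_REGION, AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY), so the API will reject an update that tries to set them.

```python
import boto3

# Keys the Lambda runtime reserves and will not accept in Environment.Variables.
RESERVED_KEYS = {"AWS_REGION", "AWS_DEFAULT_REGION", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"}


def apply_env_vars(function_name: str, env_vars: dict) -> None:
    # Drop unset values and reserved keys before updating the function configuration.
    variables = {k: v for k, v in env_vars.items() if v is not None and k not in RESERVED_KEYS}
    lambda_client = boto3.client("lambda")
    lambda_client.update_function_configuration(
        FunctionName=function_name,
        Environment={"Variables": variables},
    )
```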
@@ -50,11 +50,12 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
 # Only need to fill out this if your storage strategy is pinecone [ Not currently supported. ]
 PINECONE_API_KEY=***
 PINECONE_DB_NAME=open-rss-articles
-PINECONE_API_KEY=***
 PINECONE_SHARDS=***
+PINECONE_NAMESPACE=IngestRSS-Articles

 VECTOR_EMBEDDING_MODEL=***
 VECTOR_EMBEDDING_DIM=***
 VECTOR_SEARCH_METRIC=***
-PINECONE_DB_NAME=***

 OPENAI_API_KEY=sk**
+OPENAI_EMBEDDING_MODEL=text-embedding-3-large
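The new PINECONE_NAMESPACE and OPENAI_EMBEDDING_MODEL entries are read through `os.getenv` in the code changed above; for local runs they would typically come from this .env file. A small sketch using python-dotenv (the library choice is an assumption, it is not referenced in this diff):

```python
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # loads key=value pairs from .env into the process environment

print(os.getenv("OPENAI_EMBEDDING_MODEL"))  # e.g. text-embedding-3-large
print(os.getenv("PINECONE_NAMESPACE"))      # e.g. IngestRSS-Articles
```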
todo.md (13 changes)
@@ -1,18 +1,9 @@
-# Before Public Launch
+# After Public Launch
+* Monthly Kaggle Dataset Publishing.

 * Test out Vector Databases at Small Scale.
 * Test out Vector Databases at Scale.

-* check in to ensure text cleaning processes are working. [ ]
-
-* Fix up ReadMe
-* Publish RoadMap.
-* Tips on where to gather RSS Feeds.
-* Public Launch Posts
-* Reddit
-* Twitter
-* Kaggle
-
 * Protocol for annotating data.
 * Promotional Material
 * Stuff that is not news