mirror of https://github.com/aljazceru/IngestRSS.git · synced 2025-12-17 22:14:20 +01:00
Update
33 · delete.py · Normal file
@@ -0,0 +1,33 @@
+import requests
+import openai
+
+def mock_bank_api_call(data):
+    # Mock response from the bank API
+    bank_response = {
+        "status": "success",
+        "account_balance": 1000,
+        "currency": "USD"
+    }
+    return bank_response
+
+def process_data_with_openai(data):
+    # Call the mock bank API
+    bank_data = mock_bank_api_call(data)
+
+    # Prepare the prompt for OpenAI API
+    prompt = f"Bank API returned the following data: {bank_data}. Process this data."
+
+    # Call the OpenAI API
+    openai.api_key = 'your-openai-api-key'
+    response = openai.Completion.create(
+        engine="davinci",
+        prompt=prompt,
+        max_tokens=50
+    )
+
+    return response.choices[0].text.strip()
+
+# Example usage
+data = {"account_id": "12345"}
+result = process_data_with_openai(data)
+print(result)
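Note on the new file above: openai.Completion.create(engine=...) is the legacy pre-1.0 OpenAI API and is rejected by current openai releases. A minimal sketch of the equivalent call against the >=1.0 client, assuming a chat-capable placeholder model and reusing the mock from delete.py:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def process_data_with_openai_v1(data: dict) -> str:
    bank_data = mock_bank_api_call(data)  # mock defined in delete.py above
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed placeholder model, not from the commit
        messages=[{
            "role": "user",
            "content": f"Bank API returned the following data: {bank_data}. Process this data.",
        }],
        max_tokens=50,
    )
    return response.choices[0].message.content.strip()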
@@ -365,6 +365,27 @@
 "df.sample(20)[\"content\"].values"
 ]
 },
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "def pinecone_save_article(article:dict):\n",
+  "    logger.info(\"Saving article to Pinecone\")\n",
+  "    index = get_index()\n",
+  "\n",
+  "    # Records upserted to Pinecone *MUST* include 'id' and 'values'\n",
+  "    article[\"id\"] = article[\"article_id\"]\n",
+  "    article[\"values\"] = vectorize(article[\"content\"])\n",
+  "\n",
+  "    namespace = \"IngestRSS-Articles\"\n",
+  "\n",
+  "    logger.info(\"Upserting article to Pinecone\")\n",
+  "    upsert_vectors(index, [article], namespace)\n",
+  "    logger.info(f\"Successfully upserted article w/ article-id: {article['article_id']} to Pinecone index {index.name} with namespace {namespace}\")"
+ ]
+},
 {
 "cell_type": "code",
 "execution_count": null,
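The cell above relies on each record carrying 'id' and 'values'. A minimal sketch of the record shape Pinecone's upsert consumes (field names per the Pinecone client; the example values are assumptions, not from the commit):

# Hypothetical record: 'id' and 'values' are required,
# anything else travels under 'metadata'.
record = {
    "id": "article-123",
    "values": [0.01, -0.02, 0.03],  # embedding, e.g. from vectorize()
    "metadata": {"title": "Example article", "source": "rss"},
}
index.upsert(vectors=[record], namespace="IngestRSS-Articles")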
11095 · rss_feeds.json · File diff suppressed because it is too large
@@ -82,7 +82,7 @@ def get_or_create_kms_key():
     for key in response['Keys']:
         try:
             tags = kms_client.list_resource_tags(KeyId=key['KeyId'])['Tags']
-            if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # TODO: This is inefficient and should be fixed and more readable.
+            if any(tag['TagKey'] == tag_key and tag['TagValue'] == tag_value for tag in tags) and any(tag['TagKey'] == 'region' and tag['TagValue'] == os.getenv("AWS_REGION") for tag in tags): # FIXME: This is inefficient and should be made more readable.
                 print(f"Found existing KMS key with ID: {key['KeyId']}")
                 return key['KeyId']
         except ClientError:
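One way to address the FIXME above, as a sketch rather than part of the commit: fold the tag list into a dict once, after which the two membership tests read directly.

# Assumed helper, not in the repo: same logic as the flagged condition.
def tags_match(tags: list[dict], tag_key: str, tag_value: str) -> bool:
    tag_map = {t['TagKey']: t['TagValue'] for t in tags}
    return (tag_map.get(tag_key) == tag_value
            and tag_map.get('region') == os.getenv("AWS_REGION"))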
@@ -128,8 +128,6 @@ def create_function(lambda_client, function_name, runtime, role, handler, zip_fi
         raise

 def get_pillow_layer_arn():
     url = f"https://api.klayers.cloud/api/v2/p{os.getenv('PYTHON_VERSION')}/layers/latest/{os.getenv('AWS_REGION')}/json"
     try:
         response = requests.get(url)
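For context, a hedged sketch of how the Klayers response could be turned into a Pillow layer ARN; the response schema is an assumption based on the Klayers v2 API, not taken from this diff:

# Assumed shape: a JSON list of layer entries with 'package' and 'arn' fields.
layers = response.json()
pillow = next((layer for layer in layers if layer.get("package") == "Pillow"), None)
pillow_arn = pillow["arn"] if pillow else None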
@@ -29,20 +29,19 @@ def get_index():
 def vectorize(article:str) -> list[float]:
     response = client.embeddings.create(
         input=article,
-        model="text-embedding-3-large"
+        model=os.getenv('OPENAI_EMBEDDING_MODEL')
     )

     return response.data[0].embedding


-def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str):
+def upsert_vectors(index:Pinecone.Index, vectors:list[dict], namespace:str): # [ ] Check if the data is being upserted.
     index.upsert(
         vectors=vectors,
         namespace=namespace
     )


-def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None):
+def query_vectors(index:Pinecone.Index, namespace:str, vector:list[float], top_k:int, filter_query:dict=None): # [ ] Make sure this is working.

     if len(vector) != int(embedding_dim):
         raise ValueError("Length of vector does not match the embedding dimension")
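A hedged usage sketch tying these helpers together; the function names come from the diff, while the namespace source and top_k value are assumptions:

# Assumes get_index(), vectorize() and query_vectors() as defined above.
vec = vectorize("Example article text")
results = query_vectors(index=get_index(),
                        namespace=os.getenv('PINECONE_NAMESPACE'),
                        vector=vec,
                        top_k=5)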
@@ -4,8 +4,7 @@ import os
 import logging
 from random import randint
 from datetime import datetime
 # TODO: Move this article storage logic to a separate module inside of the Lambda.
 # TODO: Get better at loading local modules inside of the Lambda.

 from analytics.embeddings.vector_db import get_index, upsert_vectors, vectorize

 logger = logging.getLogger()
@@ -35,7 +34,8 @@ def pinecone_save_article(article:dict):
     article["id"] = article["article_id"]
     article["values"] = vectorize(article["content"])

-    namespace = f"IngestRSS-Articles"
+    namespace = os.getenv('PINECONE_NAMESPACE')

     logger.info("Upserting article to Pinecone")
     upsert_vectors(index, [article], namespace)
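If the old literal is worth preserving, a hedged alternative (an assumption, not what the commit does) would keep it as a fallback:

namespace = os.getenv('PINECONE_NAMESPACE', 'IngestRSS-Articles')  # fallback default is an assumption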
@@ -10,17 +10,67 @@ def update_env_vars(function_name):
     lambda_client = boto3.client('lambda')

     env_vars = {
-        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
+        # AWS Configuration
+        'AWS_REGION': os.environ.get('AWS_REGION'),
+        'AWS_ACCOUNT_ID': os.environ.get('AWS_ACCOUNT_ID'),
+        'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),
+        'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),
+        'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION'),
+
+        # Lambda Configuration
+        'LAMBDA_FUNCTION_NAME': os.environ.get('LAMBDA_FUNCTION_NAME'),
+        'STACK_BASE': os.environ.get('STACK_BASE'),
+        'LAMBDA_EXECUTION_ROLE_NAME': os.environ.get('LAMBDA_EXECUTION_ROLE_NAME'),
+        'LAMBDA_ROLE_ARN': os.environ.get('LAMBDA_ROLE_ARN'),
+        'LAMBDA_LAYER_VERSION': os.environ.get('LAMBDA_LAYER_VERSION'),
+        'LAMBDA_LAYER_NAME': os.environ.get('LAMBDA_LAYER_NAME'),
+        'LAMBDA_LAYER_ARN': os.environ.get('LAMBDA_LAYER_ARN'),
+        'LAMBDA_RUNTIME': os.environ.get('LAMBDA_RUNTIME'),
+        'LAMBDA_TIMEOUT': os.environ.get('LAMBDA_TIMEOUT', '300'),  # Reasonable default timeout
+        'LAMBDA_MEMORY': os.environ.get('LAMBDA_MEMORY', '512'),  # Reasonable default memory
+
+        # S3 Configuration
+        'S3_BUCKET_NAME': os.environ.get('S3_BUCKET_NAME'),
+        'S3_LAMBDA_ZIPPED_BUCKET_NAME': os.environ.get('S3_LAMBDA_ZIPPED_BUCKET_NAME'),
+        'S3_LAYER_BUCKET_NAME': os.environ.get('S3_LAYER_BUCKET_NAME'),
+        'S3_LAYER_KEY_NAME': os.environ.get('S3_LAYER_KEY_NAME'),
+
+        # DynamoDB Configuration
+        'DYNAMODB_TABLE_NAME': os.environ.get('DYNAMODB_TABLE_NAME'),
+        'DYNAMODB_TABLE_ARN': os.environ.get('DYNAMODB_TABLE_ARN'),
+
+        # SQS Configuration
+        'SQS_QUEUE_NAME': os.environ.get('SQS_QUEUE_NAME'),
+        'SQS_QUEUE_URL': os.environ.get('SQS_QUEUE_URL'),
+        'SQS_QUEUE_ARN': os.environ.get('SQS_QUEUE_ARN'),
+
+        # Queue Filler Lambda Configuration
+        'QUEUE_FILLER_LAMBDA_NAME': os.environ.get('QUEUE_FILLER_LAMBDA_NAME'),
+        'QUEUE_FILLER_LAMBDA_S3_KEY': os.environ.get('QUEUE_FILLER_LAMBDA_S3_KEY'),
+
+        # Python Configuration
+        'PYTHON_VERSION': os.environ.get('PYTHON_VERSION', '3.12'),  # Default Python version
+
+        # Application Settings
+        'APP_NAME': os.environ.get('APP_NAME', 'RSS Feed Processor'),  # Default app name is fine
+        'VERSION': os.environ.get('VERSION', '1.0.0'),  # Default version is fine
+        'LOG_LEVEL': os.environ.get('LOG_LEVEL', 'INFO'),  # Default to INFO logging
+
+        # Storage Configuration
+        'STORAGE_STRATEGY': os.environ.get('STORAGE_STRATEGY', 's3'),  # Default to s3 storage
+
+        # Pinecone Configuration (only used if STORAGE_STRATEGY is 'pinecone')
+        'PINECONE_API_KEY': os.environ.get('PINECONE_API_KEY'),
+        'PINECONE_DB_NAME': os.environ.get('PINECONE_DB_NAME'),
+        'PINECONE_SHARDS': os.environ.get('PINECONE_SHARDS'),
+        'PINECONE_NAMESPACE': os.environ.get('PINECONE_NAMESPACE'),
+
+        # Vector Configuration
+        'VECTOR_EMBEDDING_MODEL': os.environ.get('VECTOR_EMBEDDING_MODEL'),
+        'VECTOR_EMBEDDING_DIM': os.environ.get('VECTOR_EMBEDDING_DIM'),
+        'VECTOR_SEARCH_METRIC': os.environ.get('VECTOR_SEARCH_METRIC'),
+
+        # OpenAI Configuration
+        'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY')
     }
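A hedged sketch of the step that presumably follows this hunk: Lambda environment values must be strings and boto3 rejects None, so unset variables need dropping before the update call. This is an assumption about the surrounding function, not shown in the diff:

env_vars = {k: v for k, v in env_vars.items() if v is not None}
lambda_client.update_function_configuration(
    FunctionName=function_name,
    Environment={'Variables': env_vars},
)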
@@ -50,11 +50,12 @@ STORAGE_STRATEGY=s3 # 's3' or 'pinecone' will support others in the future.
 # Only need to fill this out if your storage strategy is pinecone [ Not currently supported. ]
 PINECONE_API_KEY=***
 PINECONE_DB_NAME=open-rss-articles
 PINECONE_API_KEY=***
 PINECONE_SHARDS=***
 PINECONE_NAMESPACE=IngestRSS-Articles

 VECTOR_EMBEDDING_MODEL=***
 VECTOR_EMBEDDING_DIM=***
 VECTOR_SEARCH_METRIC=***
 PINECONE_DB_NAME=***

 OPENAI_API_KEY=sk**
+OPENAI_EMBEDDING_MODEL=text-embedding-3-large
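For reference, a minimal sketch of loading this sample file in local development; python-dotenv is an assumption here, the repo may wire environment variables differently:

from dotenv import load_dotenv
import os

load_dotenv()  # reads .env from the working directory
print(os.getenv("OPENAI_EMBEDDING_MODEL"))  # -> text-embedding-3-large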
13 · todo.md
@@ -1,18 +1,9 @@
-# Before Public Launch
+# After Public Launch
 * Monthly Kaggle Dataset Publishing.

-* Test out Vector Databases at Small Scale.
+* Test out Vector Databases at Scale.

 * check in to ensure text cleaning processes are working. [ ]

-* Fix up ReadMe
-* Publish RoadMap.
-* Tips on where to gather RSS Feeds.
-* Public Launch Posts
-    * Reddit
-    * Twitter
-    * Kaggle

 * Protocol for annotating data.
-* Promotional Material
-    * Stuff that is not news