first deploy

Benito Martin
2024-06-30 00:32:19 +02:00
parent 876aebac2a
commit 4f8c435d62
13 changed files with 874 additions and 1 deletion

71
.github/workflows/build_deploy.yaml vendored Normal file

@@ -0,0 +1,71 @@
name: Build and Deploy to GKE

on:
  push:
    branches:
      - main

env:
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  GKE_CLUSTER: llama-gke-cluster    # Cluster name
  GKE_ZONE: europe-west6-a          # Cluster zone
  DEPLOYMENT_NAME: llama-gke-deploy # Deployment name
  IMAGE: llama-app-gke-image        # Image name

jobs:
  setup-build-publish-deploy:
    name: Setup, Build, Publish, and Deploy
    runs-on: ubuntu-latest
    environment: production

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Setup gcloud CLI
      - id: 'auth'
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GKE_SA_KEY }}'

      # Configure Docker to use the gcloud command-line tool as a credential
      # helper for authentication
      - run: |-
          gcloud --quiet auth configure-docker

      # Get the GKE credentials so we can deploy to the cluster
      - uses: google-github-actions/get-gke-credentials@db150f2cc60d1716e61922b832eae71d2a45938f
        with:
          cluster_name: ${{ env.GKE_CLUSTER }}
          location: ${{ env.GKE_ZONE }}
          credentials: ${{ secrets.GKE_SA_KEY }}

      # Build the Docker image
      - name: Build
        run: |-
          docker build --no-cache \
            --tag "gcr.io/$PROJECT_ID/$IMAGE:$GITHUB_SHA" \
            --build-arg GITHUB_SHA="$GITHUB_SHA" \
            --build-arg GITHUB_REF="$GITHUB_REF" \
            .

      # Push the Docker image to Google Container Registry
      - name: Publish
        run: |-
          docker push "gcr.io/$PROJECT_ID/$IMAGE:$GITHUB_SHA"

      # Set up kustomize and create the application secrets
      - name: Set up Kustomize
        run: |-
          curl -sfLo kustomize https://github.com/kubernetes-sigs/kustomize/releases/download/v3.1.0/kustomize_3.1.0_linux_amd64
          chmod u+x ./kustomize
          kubectl create secret generic openai-secret --from-literal=OPENAI_API_KEY=${{secrets.OPENAI_API_KEY}} || true
          kubectl create secret generic qdrant-secret --from-literal=QDRANT_API_KEY=${{secrets.QDRANT_API_KEY}} --from-literal=QDRANT_URL=${{secrets.QDRANT_URL}} --from-literal=COLLECTION_NAME=${{secrets.COLLECTION_NAME}} || true

      # Deploy the Docker image to the GKE cluster
      - name: Deploy
        run: |-
          ./kustomize edit set image gcr.io/PROJECT_ID/IMAGE:TAG=gcr.io/$PROJECT_ID/$IMAGE:$GITHUB_SHA
          ./kustomize build . | kubectl apply -f -
          kubectl rollout status deployment/$DEPLOYMENT_NAME
          kubectl get services -o wide
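Once the workflow has run, you can confirm that the image was pushed. A minimal sketch, assuming an authenticated `gcloud` CLI pointed at the same project (replace PROJECT_ID with your project ID):

```bash
# List the most recent tags pushed for the workflow image
gcloud container images list-tags gcr.io/PROJECT_ID/llama-app-gke-image --limit=5
```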

36
.github/workflows/ci.yaml vendored Normal file

@@ -0,0 +1,36 @@
name: Continuous Integration

on:
  push:
    branches:
      - main

jobs:
  lint-and-test:
    runs-on: ubuntu-latest
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
      QDRANT_URL: ${{ secrets.QDRANT_URL }}
      COLLECTION_NAME: ${{ secrets.COLLECTION_NAME }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run linting
        run: |
          make lint

      - name: Run tests
        run: |
          make test

19
Dockerfile Normal file

@@ -0,0 +1,19 @@
FROM python:3.10
WORKDIR /app
# Copy application code
COPY . .
# Clear pip cache
RUN pip cache purge
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Expose port
EXPOSE 8000
# Command to run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

21
Makefile Normal file

@@ -0,0 +1,21 @@
# Makefile
.PHONY: all req lint test clean

# Variables
PIP := pip
RUFF := ruff

all: req lint test clean ## Run all tasks

req: ## Install the requirements
	$(PIP) install -r requirements.txt

lint: ## Run linter and code formatter (ruff)
	$(RUFF) check . --fix

test: ## Run tests using pytest
	pytest tests/

clean: ## Clean up generated files
	rm -rf __pycache__

192
README.md

@@ -1 +1,191 @@
# scale-gke-qdrant-llama
# Q&A Pipeline Deployment on GKE for Scalability with LlamaIndex and Qdrant". 🚀
<p align="center">
<img width="976" alt="aws" src="https://github.com/benitomartin/mlops-aws-insurance/assets/116911431/4bfeb7ce-b151-4042-8cf6-c83299a2765a">
</p>
This repository contains a full Q&A pipeline using the LlamaIndex framework, Qdrant as a vector database, and deployment on Google Kubernetes Engine (GKE) using a FastAPI app and Dockerfile. Python files from my repositories are loaded into the vector database, and the FastAPI app processes requests. The main goal is to provide fast access to your own code, enabling reuse of functions.
For detailed project descriptions, refer to this [Medium article](XXX).
## Main Steps

- **Data Ingestion**: Load data from GitHub repositories.
- **Indexing**: Use SentenceSplitter to split documents into nodes.
- **Embedding**: Embed the nodes with OpenAIEmbedding.
- **Vector Store**: Use Qdrant to store embeddings and metadata.
- **Query Retrieval**: Implement RetrieverQueryEngine with a sentence reranker.
- **FastAPI and GKE**: Handle requests via the FastAPI app deployed on GKE.
- **Streamlit**: Provide a simple UI for querying the endpoint.
Feel free to ⭐ and clone this repo 😉
## Tech Stack
![Visual Studio Code](https://img.shields.io/badge/Visual%20Studio%20Code-0078d7.svg?style=for-the-badge&logo=visual-studio-code&logoColor=white)
![Jupyter Notebook](https://img.shields.io/badge/jupyter-%23FA0F00.svg?style=for-the-badge&logo=jupyter&logoColor=white)
![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
![OpenAI](https://img.shields.io/badge/OpenAI-74aa9c?style=for-the-badge&logo=openai&logoColor=white)
![Anaconda](https://img.shields.io/badge/Anaconda-%2344A833.svg?style=for-the-badge&logo=anaconda&logoColor=white)
![Linux](https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=white)
![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white)
![Google Cloud](https://img.shields.io/badge/GoogleCloud-%234285F4.svg?style=for-the-badge&logo=google-cloud&logoColor=white)
![Kubernetes](https://img.shields.io/badge/kubernetes-%23326ce5.svg?style=for-the-badge&logo=kubernetes&logoColor=white)
![FastAPI](https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi)
![Git](https://img.shields.io/badge/git-%23F05033.svg?style=for-the-badge&logo=git&logoColor=white)
![Docker](https://img.shields.io/badge/docker-%230db7ed.svg?style=for-the-badge&logo=docker&logoColor=white)
![GitHub Actions](https://img.shields.io/badge/github%20actions-%232671E5.svg?style=for-the-badge&logo=githubactions&logoColor=white)
![Streamlit](https://img.shields.io/badge/Streamlit-FF4B4B?style=for-the-badge&logo=Streamlit&logoColor=white)
## Project Structure
The project has been structured with the following files:
- `.github/workflows`: CI/CD pipelines
- `tests`: unit tests for the FastAPI app
- `Dockerfile`: Dockerfile for the FastAPI app
- `Makefile`: install requirements, formatting, linting, testing, and clean up
- `app.py`: FastAPI app
- `pyproject.toml`: linting and formatting using ruff
- `create_qdrant_collection.py`: script to create the collection in Qdrant
- `deploy-gke.yaml`: Kubernetes deployment, service, and autoscaler manifests
- `kustomization.yaml`: Kustomize configuration
- `requirements.txt`: project requirements
## Project Set Up
The Python version used for this project is Python 3.10. You can follow along with the Medium article.
1. Clone the repo (or download it as a zip file):
```bash
git clone https://github.com/benitomartin/scale-gke-qdrant-llama.git
```
2. Create the virtual environment named `main-env` using Conda with Python version 3.10:
```bash
conda create -n main-env python=3.10
conda activate main-env
```
3. Install the project dependencies included in `requirements.txt`, either directly or via the `Makefile`:

```bash
pip install -r requirements.txt
```

or

```bash
make req
```
4. You can test the app locally by running:
```bash
uvicorn app:app --host 0.0.0.0 --port 8000
```
Then go to one of these addresses:

- http://localhost:8000/docs
- http://127.0.0.1:8000/docs
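You can also send a request to the `/query/` endpoint directly from the terminal. A minimal sketch (the query string is only an example):

```bash
curl -X POST "http://localhost:8000/query/" \
     -H "Content-Type: application/json" \
     -d '{"query": "Show me a function that loads data from GitHub."}'
```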
5. Create a **GCP account**, a project, and a service account key, and enable the GKE API.
6. Make sure the `.env` file is complete:
```bash
OPENAI_API_KEY=
QDRANT_API_KEY=
QDRANT_URL=
COLLECTION_NAME=
ACCESS_TOKEN=
GITHUB_USERNAME=
```
7. Add the following secrets to the GitHub repository:
```bash
OPENAI_API_KEY
QDRANT_API_KEY
QDRANT_URL
COLLECTION_NAME
GKE_SA_KEY
GKE_PROJECT # PROJECT_ID
```
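If you use the GitHub CLI, the same secrets can be added from the terminal. A minimal sketch (values are placeholders):

```bash
gh secret set OPENAI_API_KEY --body "sk-..."
gh secret set GKE_SA_KEY < service-account-key.json
```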
8. Authenticate with GCP and set the project:
```bash
gcloud auth login
```
```bash
gcloud config set project PROJECT_ID
```
9. Create the Kubernetes cluster:
```bash
gcloud container clusters create llama-gke-cluster \
--zone=europe-west6-a \
--num-nodes=5 \
--enable-autoscaling \
--min-nodes=1 \
--max-nodes=10 \
--machine-type=n1-standard-4 \
--enable-vertical-pod-autoscaling
```
After creation, check the nodes:
```bash
kubectl get nodes
```
10. Push the GitHub Actions workflows to start the deployment.
11. Verify that the pods and services are running after deployment:
```bash
kubectl get po
kubectl get svc
```
<p align="center">
<img width="940" alt="Pods Running" src="https://github.com/benitomartin/mlops-car-prices/assets/116911431/d4dee27d-383f-4375-9a21-29996a5b5089">
</p>
Under `svc`, the external IP is the endpoint (e.g. 34.65.3.225), which can be added to the Streamlit app:
<p align="center">
<img width="767" alt="lambda-gke" src="https://github.com/benitomartin/mlops-car-prices/assets/116911431/b4a7e10c-52f9-4ca2-ade3-f2136ff6bbdf">
</p>
http://34.65.191.211:8000
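Once the LoadBalancer IP is available, the deployed endpoint can be queried the same way as the local app. A minimal sketch (replace the IP with the one shown under `kubectl get svc`):

```bash
curl -X POST "http://34.65.191.211:8000/query/" \
     -H "Content-Type: application/json" \
     -d '{"query": "Show me a function that loads data from GitHub."}'
```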
12. Check some pods and logs
```bash
kubectl logs llama-app-gke-deploy-79bf48d7d8-4b77z
kubectl describe pod llama-app-gke-deploy-79bf48d7d8-4b77z
```
13. Clean up to avoid costs by deleting the deployment and the cluster:
```bash
kubectl delete deployment llama-gke-deploy
gcloud container clusters delete llama-gke-cluster --zone=europe-west6-a
```
## Streamlit UI
Run the Streamlit app, adding the endpoint URL that you get after deployment:
```bash
streamlit run streamlit_app.py
```
<p align="center">
<img width="767" alt="lambda-gke" src="https://github.com/benitomartin/mlops-car-prices/assets/116911431/b4a7e10c-52f9-4ca2-ade3-f2136ff6bbdf">
</p>

131
app.py Normal file

@@ -0,0 +1,131 @@
"""Main application file for the FastAPI app."""
import os
import openai
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from llama_index.core import PromptTemplate, get_response_synthesizer
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from pydantic import BaseModel
from qdrant_client import QdrantClient
# Load environmental variables from .env file
load_dotenv()
# FastAPI initialization
app = FastAPI()
# Configuration parameters from environment variables
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY
# Set OpenAI API key
if OPENAI_API_KEY is None:
raise ValueError("Please set the OPENAI_API_KEY environment variable.")
# Initialize Qdrant client
client = QdrantClient(
url=QDRANT_URL,
api_key=QDRANT_API_KEY
)
# Initialize OpenAIEmbedding embedding model
embed_model = OpenAIEmbedding(openai_api_key=OPENAI_API_KEY)
# Define the query model
class QueryRequest(BaseModel):
"""Request model for querying the vector store."""
query: str
# Initialize Qdrant Vector Store
vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME, embed_model=embed_model)
# Initialize Vector Store Index
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)
# Define the prompt template for querying
qa_prompt_tmpl_str = """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, \
answer the query. Please be concise, and complete. \
If the context does not contain an answer to the query \
respond with I don't know!
Query: {query_str}
Answer: \
"""
qa_prompt = PromptTemplate(qa_prompt_tmpl_str)
# Initialize Retriever
retriever = VectorIndexRetriever(index=index)
# Initialize Response Synthesizer
response_synthesizer = get_response_synthesizer(
text_qa_template=qa_prompt,
)
# Initialize Sentence Reranker for query response
rerank = SentenceTransformerRerank(
model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3
)
# Initialize RetrieverQueryEngine for query processing
query_engine = RetrieverQueryEngine(
retriever=retriever,
response_synthesizer=response_synthesizer,
node_postprocessors=[rerank]
)
@app.post("/query/")
async def query_vector_store(request: QueryRequest):
"""
Endpoint for querying the vector store.
Args:
----
request (QueryRequest): The query request model.
Returns:
-------
str: Cleaned response to the query from the vector store.
Raises:
------
HTTPException:
If no response is found.
"""
query = request.query
response = query_engine.query(query)
if not response:
raise HTTPException(status_code=404, detail="No response found")
# Remove newline characters from the response
cleaned_response = response.response.replace("\n", "")
return cleaned_response
@app.get("/")
def read_root():
"""Root endpoint returning a simple message."""
return {"message": "GKE App V0"}
# Run the app using `uvicorn` if this file is executed directly
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)

217
create_qdrant_collection.py Normal file

@@ -0,0 +1,217 @@
"""Script for creating a Qdrant collection."""
import os
from uuid import uuid4
import openai
from dotenv import load_dotenv
from github import Github
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.github import GithubClient, GithubRepositoryReader
from qdrant_client import QdrantClient
from qdrant_client.http.exceptions import ResponseHandlingException
from qdrant_client.models import Distance, PointStruct, VectorParams
# Load environmental variables from a .env file
load_dotenv()
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')
COLLECTION_NAME = os.getenv('COLLECTION_NAME')
GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY
def get_repository_list(github_token, github_username):
"""
Fetch all repositories for a given GitHub user.
Args:
----
github_token (str): GitHub access token.
github_username (str): GitHub username.
Returns:
-------
list: List of documents fetched from the user's repositories.
"""
try:
# Initialize Github client
g = Github(github_token)
# Fetch all repositories for the user
repos = g.get_user(github_username).get_repos()
github_client = GithubClient(github_token=github_token, verbose=True)
all_documents = []
for repo in repos:
repo_name = repo.full_name
print(f"Loading files from {repo_name}")
# Check if the repository belongs to the user
if repo.owner.login != github_username:
print(f"Skipping repository {repo_name} as it does not belong to the user.")
continue
try:
# Determine the default branch
default_branch = repo.default_branch
# Load documents from the repository
documents = GithubRepositoryReader(
github_client=github_client,
owner=github_username,
repo=repo.name,
use_parser=False,
verbose=False,
filter_file_extensions=(
[".py"],
GithubRepositoryReader.FilterType.INCLUDE,
),
).load_data(branch=default_branch)
# Ensure each document has text content
for doc in documents:
if doc.text and doc.text.strip():
all_documents.append(doc)
else:
print(f"Skipping empty document: {doc.metadata['file_path']}")
except Exception as e:
print(f"Failed to load {repo_name}: {e}")
except Exception as e:
print(f"Error fetching repositories: {e}")
return all_documents
def split_documents_into_nodes(all_documents):
"""
Split documents into nodes using SentenceSplitter.
Args:
----
all_documents (list): List of Document objects.
Returns:
-------
list: List of nodes extracted from documents.
"""
try:
splitter = SentenceSplitter(
chunk_size=1500,
chunk_overlap=200
)
nodes = splitter.get_nodes_from_documents(all_documents)
return nodes
except Exception as e:
print(f"Error splitting documents into nodes: {e}")
return []
def create_collection_if_not_exists(client, collection_name):
"""
Create a Qdrant collection if it does not already exist.
Args:
----
client (QdrantClient): The Qdrant client instance.
collection_name (str): The name of the collection.
"""
try:
collections = client.get_collections()
if collection_name not in [col.name for col in collections.collections]:
client.create_collection(
collection_name=collection_name,
vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
)
print(f"Collection '{collection_name}' created.")
else:
print(f"Collection '{collection_name}' already exists.")
except ResponseHandlingException as e:
print(f"Error checking or creating collection: {e}")
def chunked_nodes(data, client, collection_name):
"""
Process and upsert chunked metadata into Qdrant.
Args:
----
data (list): The list of document chunks.
client (QdrantClient): The Qdrant client instance.
collection_name (str): The name of the collection.
"""
chunked_nodes = []
for item in data:
qdrant_id = str(uuid4())
document_id = item.id_
code_text = item.text
source = item.metadata["url"]
file_name = item.metadata["file_name"]
content_vector = embed_model.get_text_embedding(code_text)
payload = {
"text": code_text,
"document_id": document_id,
"metadata": {
"qdrant_id": qdrant_id,
"source": source,
"file_name": file_name,
}
}
metadata = PointStruct(id=qdrant_id, vector=content_vector, payload=payload)
chunked_nodes.append(metadata)
if chunked_nodes:
client.upsert(
collection_name=collection_name,
wait=True,
points=chunked_nodes
)
print(f"{len(chunked_nodes)} Chunked metadata upserted.")
if __name__ == "__main__":
# Fetch documents from GitHub repositories
all_documents = get_repository_list(ACCESS_TOKEN, GITHUB_USERNAME)
if all_documents:
# Split documents into nodes
nodes = split_documents_into_nodes(all_documents)
# Initialize embedding model
embed_model = OpenAIEmbedding(openai_api_key=OPENAI_API_KEY)
# Initialize Qdrant client
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
# Create collection if it does not exist
create_collection_if_not_exists(client, COLLECTION_NAME)
# Upsert documents in vector store
chunked_nodes(nodes[:2], client, COLLECTION_NAME)
else:
print("No documents to process.")

110
deploy-gke.yaml Normal file

@@ -0,0 +1,110 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-gke-deploy
spec:
  replicas: 2
  selector:
    matchLabels:
      app: llama-gke-pod
  template:
    metadata:
      labels:
        app: llama-gke-pod
    spec:
      containers:
        - name: llama-gke-container
          image: gcr.io/PROJECT_ID/IMAGE:TAG
          ports:
            - containerPort: 8000  # Port inside the container where the FastAPI app is running
          env:
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: openai-secret
                  key: OPENAI_API_KEY
            - name: QDRANT_API_KEY
              valueFrom:
                secretKeyRef:
                  name: qdrant-secret
                  key: QDRANT_API_KEY
            - name: QDRANT_URL
              valueFrom:
                secretKeyRef:
                  name: qdrant-secret
                  key: QDRANT_URL
            - name: COLLECTION_NAME
              valueFrom:
                secretKeyRef:
                  name: qdrant-secret
                  key: COLLECTION_NAME
          resources:
            requests:  # Minimum resources required
              memory: "2Gi"
              cpu: "1"
            limits:  # Maximum resources allowed
              memory: "4Gi"
              cpu: "2"
          readinessProbe:  # Check if the pod is ready to serve traffic
            httpGet:
              scheme: HTTP
              path: /
              port: 8000            # Port for readiness probe (should match containerPort)
            initialDelaySeconds: 240  # Delay before the first probe is executed
            periodSeconds: 60         # Interval between probes
          livenessProbe:  # Check if the pod is alive
            httpGet:
              scheme: HTTP
              path: /
              port: 8000            # Port for liveness probe (should match containerPort)
            initialDelaySeconds: 240  # Delay before the first probe is executed
            periodSeconds: 60         # Interval between probes
---
apiVersion: v1
kind: Service
metadata:
  name: mylb
spec:
  type: LoadBalancer
  selector:
    app: llama-gke-pod
  ports:
    - port: 8000        # Port exposed by the Kubernetes service (could be 80 as well)
      targetPort: 8000  # Port where the service forwards traffic to (should match containerPort)

# Vertical scaling
---
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: llama-gke-deploy-vpa
spec:
  targetRef:
    apiVersion: "apps/v1"
    kind: Deployment
    name: llama-gke-deploy
  updatePolicy:         # Policy for updating the resource requests and limits
    updateMode: "Auto"  # Automatically update the resource requests and limits

# Horizontal scaling
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llama-gke-deploy-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llama-gke-deploy
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource  # Type of metric
      resource:       # Resource-based metric
        name: cpu     # Metric name
        target:
          type: Utilization       # Type of target value
          averageUtilization: 70  # Average CPU utilization percentage to maintain
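Once the manifests are applied, the autoscalers can be inspected from the command line. A minimal sketch (the VPA object exists because the cluster was created with `--enable-vertical-pod-autoscaling`):

```bash
kubectl get hpa llama-gke-deploy-hpa
kubectl get vpa llama-gke-deploy-vpa
kubectl top pods   # current CPU/memory usage from the metrics server
```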

5
kustomization.yaml Normal file

@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- deploy-gke.yaml

19
pyproject.toml Normal file

@@ -0,0 +1,19 @@
[tool.ruff]
# Define maximum line length
line-length = 88

[tool.ruff.lint]
# Define the rules to enforce, including flake8 and isort rules
select = [
    "E",   # pycodestyle errors (flake8)
    "F",   # Pyflakes errors (flake8)
    "W",   # pycodestyle warnings (flake8)
    "C90", # mccabe complexity
    "I",   # Import sorting (isort)
    "D",   # Docstring conventions (pydocstyle)
]
# Exclude specific error codes
ignore = ["E501", "D211", "D212"]

15
requirements.txt Normal file

@@ -0,0 +1,15 @@
fastapi
uvicorn
openai
torch
sentence-transformers
PyGithub
python-dotenv
qdrant-client
llama-index
llama-index-embeddings-openai
llama-index-vector-stores-qdrant
llama-index-readers-github
ruff
streamlit
pytest

6
tests/__init__.py Normal file

@@ -0,0 +1,6 @@
# tests/__init__.py
"""
Tests for the application.
This module contains unit tests for various components of the application.
"""

33
tests/test_app.py Normal file

@@ -0,0 +1,33 @@
"""
Unit tests for the FastAPI application.
These tests verify the behavior of different endpoints.
"""
from fastapi.testclient import TestClient
from app import app
client = TestClient(app)
def test_read_root():
"""
Test the root endpoint ("/").
Checks that the root endpoint returns a status code of 200 and a message.
"""
response = client.get("/")
assert response.status_code == 200
assert response.json() == {"message": "GKE App V0"}
def test_query_vector_store_valid():
"""
Test the vector store query endpoint ("/query/").
Checks that the endpoint returns a valid response for a valid query payload.
"""
valid_payload = {"query": "Enter query string."}
response = client.post("/query/", json=valid_payload)
assert response.status_code == 200
assert isinstance(response.json(), str)