Mirror of https://github.com/benitomartin/scale-gke-qdrant-llama.git (synced 2025-12-17 02:54:25 +01:00)

Commit: first deploy
.github/workflows/build_deploy.yaml (vendored, new file, 71 lines)
@@ -0,0 +1,71 @@

name: Build and Deploy to GKE

on:
  push:
    branches:
      - main

env:
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  GKE_CLUSTER: llama-gke-cluster    # Cluster name
  GKE_ZONE: europe-west6-a          # Cluster zone
  DEPLOYMENT_NAME: llama-gke-deploy # Deployment name
  IMAGE: llama-app-gke-image       # Image name

jobs:
  setup-build-publish-deploy:
    name: Setup, Build, Publish, and Deploy
    runs-on: ubuntu-latest
    environment: production

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Set up the gcloud CLI
      - id: 'auth'
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GKE_SA_KEY }}'

      # Configure Docker to use the gcloud command-line tool as a credential
      # helper for authentication
      - run: |-
          gcloud --quiet auth configure-docker

      # Get the GKE credentials so we can deploy to the cluster
      - uses: google-github-actions/get-gke-credentials@db150f2cc60d1716e61922b832eae71d2a45938f
        with:
          cluster_name: ${{ env.GKE_CLUSTER }}
          location: ${{ env.GKE_ZONE }}
          credentials: ${{ secrets.GKE_SA_KEY }}

      # Build the Docker image
      - name: Build
        run: |-
          docker build --no-cache \
            --tag "gcr.io/$PROJECT_ID/$IMAGE:$GITHUB_SHA" \
            --build-arg GITHUB_SHA="$GITHUB_SHA" \
            --build-arg GITHUB_REF="$GITHUB_REF" \
            .

      # Push the Docker image to Google Container Registry
      - name: Publish
        run: |-
          docker push "gcr.io/$PROJECT_ID/$IMAGE:$GITHUB_SHA"

      # Set up kustomize and create the Kubernetes secrets
      # (`|| true` keeps the step idempotent if the secrets already exist)
      - name: Set up Kustomize
        run: |-
          curl -sfLo kustomize https://github.com/kubernetes-sigs/kustomize/releases/download/v3.1.0/kustomize_3.1.0_linux_amd64
          chmod u+x ./kustomize
          kubectl create secret generic openai-secret --from-literal=OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} || true
          kubectl create secret generic qdrant-secret --from-literal=QDRANT_API_KEY=${{ secrets.QDRANT_API_KEY }} --from-literal=QDRANT_URL=${{ secrets.QDRANT_URL }} --from-literal=COLLECTION_NAME=${{ secrets.COLLECTION_NAME }} || true

      # Deploy the Docker image to the GKE cluster
      - name: Deploy
        run: |-
          ./kustomize edit set image gcr.io/PROJECT_ID/IMAGE:TAG=gcr.io/$PROJECT_ID/$IMAGE:$GITHUB_SHA
          ./kustomize build . | kubectl apply -f -
          kubectl rollout status deployment/$DEPLOYMENT_NAME
          kubectl get services -o wide
.github/workflows/ci.yaml (vendored, new file, 36 lines)
@@ -0,0 +1,36 @@

name: Continuous Integration

on:
  push:
    branches:
      - main

jobs:
  lint-and-test:
    runs-on: ubuntu-latest
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }}
      QDRANT_URL: ${{ secrets.QDRANT_URL }}
      COLLECTION_NAME: ${{ secrets.COLLECTION_NAME }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: "3.10" # match the project's Python version (quoted so YAML does not read it as 3.1)

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run linting
        run: |
          make lint

      - name: Run tests
        run: |
          make test
Dockerfile (new file, 19 lines)
@@ -0,0 +1,19 @@

FROM python:3.10

WORKDIR /app

# Copy application code
COPY . .

# Clear pip cache
RUN pip cache purge

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose port
EXPOSE 8000

# Command to run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
Makefile (new file, 21 lines)
@@ -0,0 +1,21 @@

# Makefile

.PHONY: all req lint test clean

# Variables
PIP := pip
RUFF := ruff

all: req lint test clean ## Run all tasks

req: ## Install the requirements
	$(PIP) install -r requirements.txt

lint: ## Run linter and code formatter (ruff)
	$(RUFF) check . --fix

test: ## Run tests using pytest
	pytest tests/

clean: ## Clean up generated files
	rm -rf __pycache__
README.md (192 lines)
@@ -1 +1,191 @@

# Q&A Pipeline Deployment on GKE for Scalability with LlamaIndex and Qdrant 🚀

<p align="center">
<img width="976" alt="aws" src="https://github.com/benitomartin/mlops-aws-insurance/assets/116911431/4bfeb7ce-b151-4042-8cf6-c83299a2765a">
</p>

This repository contains a full Q&A pipeline using the LlamaIndex framework, Qdrant as a vector database, and deployment on Google Kubernetes Engine (GKE) using a FastAPI app and a Dockerfile. Python files from my repositories are loaded into the vector database, and the FastAPI app processes requests. The main goal is to provide fast access to your own code, enabling reuse of functions.

For a detailed project description, refer to this [Medium article](XXX).

Main Steps:

- **Data Ingestion**: Load data from GitHub repositories.
- **Indexing**: Use SentenceSplitter to split the documents into nodes.
- **Embedding**: Implement OpenAIEmbedding.
- **Vector Store**: Use Qdrant to insert the nodes and their metadata.
- **Query Retrieval**: Implement RetrieverQueryEngine.
- **FastAPI and GKE**: Handle requests via the FastAPI app deployed on GKE.
- **Streamlit**: UI component.

Feel free to ⭐ and clone this repo 😉

## Tech Stack

![Visual Studio Code](https://img.shields.io/badge/Visual%20Studio%20Code-0078d7.svg?style=for-the-badge&logo=visual-studio-code&logoColor=white)
![Jupyter Notebook](https://img.shields.io/badge/jupyter-%23FA0F00.svg?style=for-the-badge&logo=jupyter&logoColor=white)
![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
![Anaconda](https://img.shields.io/badge/Anaconda-%2344A833.svg?style=for-the-badge&logo=anaconda&logoColor=white)
![FastAPI](https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi)
![Linux](https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=black)
![Ubuntu](https://img.shields.io/badge/Ubuntu-E95420?style=for-the-badge&logo=ubuntu&logoColor=white)
![Google Cloud](https://img.shields.io/badge/GoogleCloud-%234285F4.svg?style=for-the-badge&logo=google-cloud&logoColor=white)
![Kubernetes](https://img.shields.io/badge/kubernetes-%23326ce5.svg?style=for-the-badge&logo=kubernetes&logoColor=white)
![Docker](https://img.shields.io/badge/docker-%230db7ed.svg?style=for-the-badge&logo=docker&logoColor=white)
![GitHub Actions](https://img.shields.io/badge/githubactions-%232671E5.svg?style=for-the-badge&logo=githubactions&logoColor=white)
![GIT](https://img.shields.io/badge/GIT-E44C30?style=for-the-badge&logo=git&logoColor=white)
![Qdrant](https://img.shields.io/badge/Qdrant-FF4F00?style=for-the-badge&logo=qdrant&logoColor=white)
![Streamlit](https://img.shields.io/badge/Streamlit-FF4B4B?style=for-the-badge&logo=streamlit&logoColor=white)

## Project Structure

The project is structured with the following files:

- `.github/workflows`: CI/CD pipelines
- `tests`: unit tests
- `Dockerfile`: Docker image definition
- `Makefile`: install requirements, formatting, linting, testing, and clean-up
- `app.py`: FastAPI app
- `pyproject.toml`: linting and formatting using ruff
- `create_qdrant_collection.py`: script to create the collection in Qdrant
- `deploy-gke.yaml`: deployment manifest
- `kustomization.yaml`: kustomize deployment configuration
- `requirements.txt`: project requirements

## Project Set Up

The Python version used for this project is Python 3.10. You can follow along with the Medium article.

1. Clone the repo (or download it as a zip file):

```bash
git clone https://github.com/benitomartin/scale-gke-qdrant-llama.git
```

2. Create the virtual environment named `main-env` using Conda with Python version 3.10:

```bash
conda create -n main-env python=3.10
conda activate main-env
```

3. Install the project dependencies included in `requirements.txt`, either directly or via the `Makefile`:

```bash
pip install -r requirements.txt
# or
make req
```

4. You can test the app locally by running:

```bash
uvicorn app:app --host 0.0.0.0 --port 8000
```

then go to one of these addresses:

http://localhost:8000/docs
http://127.0.0.1:8000/docs
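Once the app is up, you can also hit the Q&A endpoint directly. A minimal sketch (the query text is just an illustration):

```bash
curl -X POST "http://localhost:8000/query/" \
     -H "Content-Type: application/json" \
     -d '{"query": "What does the create_qdrant_collection.py script do?"}'
```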
5. Create a **GCP account** and project, generate a service account key, and activate the GKE API; a `gcloud` sketch is shown below.
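A minimal sketch, assuming the `gcloud` CLI is installed and `PROJECT_ID` is your project; the service-account name and roles are illustrative and can be tightened:

```bash
# Enable the GKE API
gcloud services enable container.googleapis.com --project PROJECT_ID

# Create a service account for the deployment workflow (name is illustrative)
gcloud iam service-accounts create gke-deployer --project PROJECT_ID

# Grant it access to GKE and to the container registry storage
gcloud projects add-iam-policy-binding PROJECT_ID \
    --member "serviceAccount:gke-deployer@PROJECT_ID.iam.gserviceaccount.com" \
    --role "roles/container.admin"
gcloud projects add-iam-policy-binding PROJECT_ID \
    --member "serviceAccount:gke-deployer@PROJECT_ID.iam.gserviceaccount.com" \
    --role "roles/storage.admin"

# Create the JSON key that goes into the GKE_SA_KEY secret below
gcloud iam service-accounts keys create key.json \
    --iam-account gke-deployer@PROJECT_ID.iam.gserviceaccount.com
```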
6. Make sure the `.env` file is complete:

```bash
OPENAI_API_KEY=
QDRANT_API_KEY=
QDRANT_URL=
COLLECTION_NAME=
ACCESS_TOKEN=
GITHUB_USERNAME=
```

7. Add the following secrets into GitHub (a CLI sketch follows the list):

```bash
OPENAI_API_KEY
QDRANT_API_KEY
QDRANT_URL
COLLECTION_NAME
GKE_SA_KEY
GKE_PROJECT # PROJECT_ID
```
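You can add them in the repository settings, or, as a sketch, with the GitHub CLI (assuming `gh` is installed and authenticated; values are placeholders):

```bash
gh secret set OPENAI_API_KEY --body "$OPENAI_API_KEY"
gh secret set QDRANT_API_KEY --body "$QDRANT_API_KEY"
gh secret set QDRANT_URL --body "$QDRANT_URL"
gh secret set COLLECTION_NAME --body "$COLLECTION_NAME"
gh secret set GKE_PROJECT --body "PROJECT_ID"
gh secret set GKE_SA_KEY < key.json   # the service account key from step 5
```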
8. Be sure to authenticate in GCP:

```bash
gcloud auth login
```

```bash
gcloud config set project PROJECT_ID
```

9. Create the Kubernetes cluster:

```bash
gcloud container clusters create llama-gke-cluster \
    --zone=europe-west6-a \
    --num-nodes=5 \
    --enable-autoscaling \
    --min-nodes=1 \
    --max-nodes=10 \
    --machine-type=n1-standard-4 \
    --enable-vertical-pod-autoscaling
```

After creation, check the nodes:

```bash
kubectl get nodes
```

10. Push the GitHub Actions workflows to start the deployment.

11. Verify Kubernetes is running after deployment:

```bash
kubectl get po
kubectl get svc
```

<p align="center">
<img width="940" alt="Pods Running" src="https://github.com/benitomartin/mlops-car-prices/assets/116911431/d4dee27d-383f-4375-9a21-29996a5b5089">
</p>

Under `svc`, the external IP (here 34.65.3.225) is the endpoint that can be added in the Streamlit app; a one-liner to extract it is shown below.
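A sketch to pull just the external IP of the `mylb` LoadBalancer service defined in `deploy-gke.yaml`:

```bash
kubectl get svc mylb -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
```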
<p align="center">
<img width="767" alt="lambda-gke" src="https://github.com/benitomartin/mlops-car-prices/assets/116911431/b4a7e10c-52f9-4ca2-ade3-f2136ff6bbdf">
</p>

http://34.65.191.211:8000

12. Check some pods and logs:

```bash
kubectl logs llama-gke-deploy-79bf48d7d8-4b77z
kubectl describe pod llama-gke-deploy-79bf48d7d8-4b77z
```

13. Clean up to avoid costs by deleting the cluster, the deployment, and the Docker image (an image clean-up sketch follows):

```bash
gcloud container clusters delete llama-gke-cluster --zone=europe-west6-a
kubectl delete deployment llama-gke-deploy
```
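To also remove the image that the workflow pushed to Google Container Registry (a sketch; the tag is the commit SHA used at build time):

```bash
gcloud container images list --repository=gcr.io/PROJECT_ID
gcloud container images delete gcr.io/PROJECT_ID/llama-app-gke-image:TAG --force-delete-tags
```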
## Streamlit UI

Run the Streamlit app, adding the endpoint URL that you get after deployment:

```bash
streamlit run streamlit_app.py
```

<p align="center">
<img width="767" alt="lambda-gke" src="https://github.com/benitomartin/mlops-car-prices/assets/116911431/b4a7e10c-52f9-4ca2-ade3-f2136ff6bbdf">
</p>
app.py (new file, 131 lines)
@@ -0,0 +1,131 @@

"""Main application file for the FastAPI app."""

import os

import openai
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from llama_index.core import PromptTemplate, get_response_synthesizer
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from pydantic import BaseModel
from qdrant_client import QdrantClient

# Load environment variables from .env file
load_dotenv()

# FastAPI initialization
app = FastAPI()

# Configuration parameters from environment variables
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY

# Ensure the OpenAI API key is set
if OPENAI_API_KEY is None:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

# Initialize Qdrant client
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY
)

# Initialize OpenAI embedding model
embed_model = OpenAIEmbedding(openai_api_key=OPENAI_API_KEY)


# Define the query model
class QueryRequest(BaseModel):

    """Request model for querying the vector store."""

    query: str


# Initialize Qdrant vector store
vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME, embed_model=embed_model)

# Initialize vector store index
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)

# Define the prompt template for querying
qa_prompt_tmpl_str = """\
Context information is below.
---------------------
{context_str}
---------------------

Given the context information and not prior knowledge, \
answer the query. Please be concise, and complete. \
If the context does not contain an answer to the query, \
respond with I don't know!

Query: {query_str}
Answer: \
"""
qa_prompt = PromptTemplate(qa_prompt_tmpl_str)

# Initialize retriever
retriever = VectorIndexRetriever(index=index)

# Initialize response synthesizer
response_synthesizer = get_response_synthesizer(
    text_qa_template=qa_prompt,
)

# Initialize sentence reranker for the query response
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3
)

# Initialize RetrieverQueryEngine for query processing
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[rerank]
)


@app.post("/query/")
async def query_vector_store(request: QueryRequest):
    """
    Endpoint for querying the vector store.

    Args:
    ----
        request (QueryRequest): The query request model.

    Returns:
    -------
        str: Cleaned response to the query from the vector store.

    Raises:
    ------
        HTTPException: If no response is found.

    """
    query = request.query
    response = query_engine.query(query)
    if not response:
        raise HTTPException(status_code=404, detail="No response found")

    # Remove newline characters from the response
    cleaned_response = response.response.replace("\n", "")

    return cleaned_response


@app.get("/")
def read_root():
    """Root endpoint returning a simple message."""
    return {"message": "GKE App V0"}


# Run the app using `uvicorn` if this file is executed directly
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
create_qdrant_collection.py (new file, 217 lines)
@@ -0,0 +1,217 @@

"""Script for creating a Qdrant collection."""

import os
from uuid import uuid4

import openai
from dotenv import load_dotenv
from github import Github
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.github import GithubClient, GithubRepositoryReader
from qdrant_client import QdrantClient
from qdrant_client.http.exceptions import ResponseHandlingException
from qdrant_client.models import Distance, PointStruct, VectorParams

# Load environment variables from a .env file
load_dotenv()

QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')
COLLECTION_NAME = os.getenv('COLLECTION_NAME')
GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY


def get_repository_list(github_token, github_username):
    """
    Fetch all repositories for a given GitHub user.

    Args:
    ----
        github_token (str): GitHub access token.
        github_username (str): GitHub username.

    Returns:
    -------
        list: List of documents fetched from the user's repositories.

    """
    # Initialize the list up front so it exists even if fetching fails
    all_documents = []

    try:
        # Initialize Github client
        g = Github(github_token)

        # Fetch all repositories for the user
        repos = g.get_user(github_username).get_repos()

        github_client = GithubClient(github_token=github_token, verbose=True)

        for repo in repos:
            repo_name = repo.full_name
            print(f"Loading files from {repo_name}")

            # Check if the repository belongs to the user
            if repo.owner.login != github_username:
                print(f"Skipping repository {repo_name} as it does not belong to the user.")
                continue

            try:
                # Determine the default branch
                default_branch = repo.default_branch

                # Load documents from the repository
                documents = GithubRepositoryReader(
                    github_client=github_client,
                    owner=github_username,
                    repo=repo.name,
                    use_parser=False,
                    verbose=False,
                    filter_file_extensions=(
                        [".py"],
                        GithubRepositoryReader.FilterType.INCLUDE,
                    ),
                ).load_data(branch=default_branch)

                # Ensure each document has text content
                for doc in documents:
                    if doc.text and doc.text.strip():
                        all_documents.append(doc)
                    else:
                        print(f"Skipping empty document: {doc.metadata['file_path']}")

            except Exception as e:
                print(f"Failed to load {repo_name}: {e}")

    except Exception as e:
        print(f"Error fetching repositories: {e}")

    return all_documents


def split_documents_into_nodes(all_documents):
    """
    Split documents into nodes using SentenceSplitter.

    Args:
    ----
        all_documents (list): List of Document objects.

    Returns:
    -------
        list: List of nodes extracted from documents.

    """
    try:
        splitter = SentenceSplitter(
            chunk_size=1500,
            chunk_overlap=200
        )

        nodes = splitter.get_nodes_from_documents(all_documents)

        return nodes

    except Exception as e:
        print(f"Error splitting documents into nodes: {e}")
        return []


def create_collection_if_not_exists(client, collection_name):
    """
    Create a Qdrant collection if it does not already exist.

    Args:
    ----
        client (QdrantClient): The Qdrant client instance.
        collection_name (str): The name of the collection.

    """
    try:
        collections = client.get_collections()
        if collection_name not in [col.name for col in collections.collections]:
            client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
            )

            print(f"Collection '{collection_name}' created.")
        else:
            print(f"Collection '{collection_name}' already exists.")
    except ResponseHandlingException as e:
        print(f"Error checking or creating collection: {e}")


def chunked_nodes(data, client, collection_name):
    """
    Process and upsert chunked metadata into Qdrant.

    Args:
    ----
        data (list): The list of document chunks.
        client (QdrantClient): The Qdrant client instance.
        collection_name (str): The name of the collection.

    """
    chunked_nodes = []

    for item in data:
        qdrant_id = str(uuid4())
        document_id = item.id_
        code_text = item.text
        source = item.metadata["url"]
        file_name = item.metadata["file_name"]

        # Embed the node text with the module-level embedding model
        content_vector = embed_model.get_text_embedding(code_text)

        payload = {
            "text": code_text,
            "document_id": document_id,
            "metadata": {
                "qdrant_id": qdrant_id,
                "source": source,
                "file_name": file_name,
            }
        }

        metadata = PointStruct(id=qdrant_id, vector=content_vector, payload=payload)

        chunked_nodes.append(metadata)

    if chunked_nodes:
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=chunked_nodes
        )

    print(f"{len(chunked_nodes)} Chunked metadata upserted.")


if __name__ == "__main__":
    # Fetch documents from GitHub repositories
    all_documents = get_repository_list(ACCESS_TOKEN, GITHUB_USERNAME)

    if all_documents:
        # Split documents into nodes
        nodes = split_documents_into_nodes(all_documents)

        # Initialize embedding model
        embed_model = OpenAIEmbedding(openai_api_key=OPENAI_API_KEY)

        # Initialize Qdrant client
        client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

        # Create collection if it does not exist
        create_collection_if_not_exists(client, COLLECTION_NAME)

        # Upsert documents in the vector store (note: only the first two nodes here)
        chunked_nodes(nodes[:2], client, COLLECTION_NAME)
    else:
        print("No documents to process.")
deploy-gke.yaml (new file, 110 lines)
@@ -0,0 +1,110 @@

apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-gke-deploy
spec:
  replicas: 2
  selector:
    matchLabels:
      app: llama-gke-pod
  template:
    metadata:
      labels:
        app: llama-gke-pod
    spec:
      containers:
      - name: llama-gke-container
        image: gcr.io/PROJECT_ID/IMAGE:TAG
        ports:
        - containerPort: 8000 # Port inside the container where the FastAPI app is running
        env:
        - name: OPENAI_API_KEY
          valueFrom:
            secretKeyRef:
              name: openai-secret
              key: OPENAI_API_KEY
        - name: QDRANT_API_KEY
          valueFrom:
            secretKeyRef:
              name: qdrant-secret
              key: QDRANT_API_KEY
        - name: QDRANT_URL
          valueFrom:
            secretKeyRef:
              name: qdrant-secret
              key: QDRANT_URL
        - name: COLLECTION_NAME
          valueFrom:
            secretKeyRef:
              name: qdrant-secret
              key: COLLECTION_NAME
        resources:
          requests: # Minimum resources required
            memory: "2Gi"
            cpu: "1"
          limits: # Maximum resources allowed
            memory: "4Gi"
            cpu: "2"
        readinessProbe: # Check if the pod is ready to serve traffic
          httpGet:
            scheme: HTTP
            path: /
            port: 8000 # Port for readiness probe (should match containerPort)
          initialDelaySeconds: 240 # Delay before the first probe is executed
          periodSeconds: 60 # Interval between probes
        livenessProbe: # Check if the pod is alive
          httpGet:
            scheme: HTTP
            path: /
            port: 8000 # Port for liveness probe (should match containerPort)
          initialDelaySeconds: 240 # Delay before the first probe is executed
          periodSeconds: 60 # Interval between probes

---
apiVersion: v1
kind: Service
metadata:
  name: mylb
spec:
  type: LoadBalancer
  selector:
    app: llama-gke-pod
  ports:
  - port: 8000 # Port exposed by the Kubernetes service (could be 80 as well)
    targetPort: 8000 # Port where the service forwards traffic to (should match containerPort)

# Vertical scaling
---
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: llama-gke-deploy-vpa
spec:
  targetRef:
    apiVersion: "apps/v1"
    kind: Deployment
    name: llama-gke-deploy
  updatePolicy: # Policy for updating the resource requests and limits
    updateMode: "Auto" # Automatically update the resource requests and limits

# Horizontal scaling
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llama-gke-deploy-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llama-gke-deploy
  minReplicas: 2
  maxReplicas: 10
  metrics:
  - type: Resource # Type of metric
    resource: # Resource-based metric
      name: cpu # Metric name
      target:
        type: Utilization # Type of target value
        averageUtilization: 70 # Average CPU utilization percentage to maintain
kustomization.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@

apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - deploy-gke.yaml
pyproject.toml (new file, 19 lines)
@@ -0,0 +1,19 @@

[tool.ruff]
# Define maximum line length
line-length = 88

[tool.ruff.lint]
# Define the rules to enforce, including flake8 and isort rules
select = [
    "E",   # Errors (pycodestyle, from flake8)
    "F",   # Pyflakes (from flake8)
    "W",   # Warnings (pycodestyle, from flake8)
    "C90", # McCabe complexity (from mccabe)
    "I",   # Import sorting (from isort)
    "D",   # Docstring conventions (from pydocstyle)
]

# Exclude specific error codes
ignore = ["E501", "D211", "D212"]
requirements.txt (new file, 15 lines)
@@ -0,0 +1,15 @@

fastapi
uvicorn
openai
torch
sentence-transformers
PyGithub
python-dotenv
qdrant-client
llama-index
llama-index-embeddings-openai
llama-index-vector-stores-qdrant
llama-index-readers-github
ruff
streamlit
pytest
tests/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@

# tests/__init__.py
"""
Tests for the application.

This module contains unit tests for various components of the application.
"""
tests/test_app.py (new file, 33 lines)
@@ -0,0 +1,33 @@

"""
Unit tests for the FastAPI application.

These tests verify the behavior of different endpoints.
"""

from fastapi.testclient import TestClient

from app import app

client = TestClient(app)


def test_read_root():
    """
    Test the root endpoint ("/").

    Checks that the root endpoint returns a status code of 200 and a message.
    """
    response = client.get("/")
    assert response.status_code == 200
    assert response.json() == {"message": "GKE App V0"}


def test_query_vector_store_valid():
    """
    Test the vector store query endpoint ("/query/").

    Checks that the endpoint returns a valid response for a valid query payload.
    Note: this goes through the real query engine, so it needs live Qdrant and
    OpenAI credentials in the environment.
    """
    valid_payload = {"query": "Enter query string."}
    response = client.post("/query/", json=valid_payload)
    assert response.status_code == 200
    assert isinstance(response.json(), str)