# scale-gke-qdrant-llama/deploy-gke.yaml

# Deployment for the FastAPI application container. Pods carry the label
# `app: llama-gke-pod`, which the Service selector in this file matches.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-gke-deploy
spec:
  # `replicas` is intentionally omitted: the HorizontalPodAutoscaler
  # `llama-gke-deploy-hpa` (minReplicas: 2) owns the replica count. Declaring
  # it here would reset the HPA-chosen scale on every `kubectl apply`.
  # NOTE: if this manifest was previously applied with `replicas` set, the
  # first apply without it may briefly drop to the default of 1 replica
  # until the HPA scales back up to its minimum.
  selector:
    matchLabels:
      app: llama-gke-pod
  template:
    metadata:
      labels:
        app: llama-gke-pod
    spec:
      containers:
      - name: llama-gke-container
        # Placeholder image reference; substitute PROJECT_ID, IMAGE and TAG
        # before applying.
        image: gcr.io/PROJECT_ID/IMAGE:TAG
        ports:
        - containerPort: 8000     # Port inside the container where the FastAPI app is running
        # Every environment variable is read from a Kubernetes Secret, so the
        # `openai-secret` and `qdrant-secret` Secrets must exist in the same
        # namespace before the pods can start.
        env:
        - name: OPENAI_API_KEY
          valueFrom:
            secretKeyRef:
              name: openai-secret
              key: OPENAI_API_KEY
        - name: QDRANT_API_KEY
          valueFrom:
            secretKeyRef:
              name: qdrant-secret
              key: QDRANT_API_KEY
        - name: QDRANT_URL
          valueFrom:
            secretKeyRef:
              name: qdrant-secret
              key: QDRANT_URL
        - name: COLLECTION_NAME
          valueFrom:
            secretKeyRef:
              name: qdrant-secret
              key: COLLECTION_NAME
        resources:
          requests:               # Minimum resources required.
            memory: "2Gi"
            cpu: "1"
          limits:                 # Maximum resources allowed
            memory: "4Gi"
            cpu: "2"
        readinessProbe:           # Check if the pod is ready to serve traffic.
          httpGet:
            scheme: HTTP
            path: /
            port: 8000              # Port for readiness probe (should match containerPort)
          initialDelaySeconds: 240  # Delay before first probe is executed
          periodSeconds: 60         # Interval between probes
        livenessProbe:              # Check if the pod is alive
          httpGet:
            scheme: HTTP
            path: /
            port: 8000              # Port for liveness probe (should match containerPort)
          initialDelaySeconds: 240  # Delay before first probe is executed
          periodSeconds: 60         # Interval between probes


---
# LoadBalancer Service that routes traffic to the pods labelled
# `app: llama-gke-pod`.
apiVersion: v1
kind: Service
metadata:
  name: mylb
spec:
  type: LoadBalancer
  ports:
  # `port` is what the Service exposes (80 would work equally well);
  # `targetPort` is where traffic is forwarded and should match the
  # container's containerPort.
  - targetPort: 8000
    port: 8000
  selector:
    app: llama-gke-pod

# Vertical scaling
---
# VerticalPodAutoscaler for the `llama-gke-deploy` Deployment.
# It is restricted to memory because the HorizontalPodAutoscaler in this file
# scales the same Deployment on CPU utilization; letting both autoscalers act
# on CPU makes them work against each other.
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: llama-gke-deploy-vpa
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llama-gke-deploy
  updatePolicy:                   # Policy for updating the resource requests and limits
    updateMode: "Auto"            # Automatically apply the recommended values
  resourcePolicy:
    containerPolicies:
    - containerName: "*"          # Applies to every container in the pod
      controlledResources:        # Leave CPU to the HPA; VPA manages memory only
      - memory

# Horizontal scaling
---
# HorizontalPodAutoscaler that adjusts the replica count of the
# `llama-gke-deploy` Deployment based on CPU utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llama-gke-deploy-hpa
spec:
  # Replica count is kept between these bounds.
  minReplicas: 2
  maxReplicas: 10
  # Workload whose replica count is managed.
  scaleTargetRef:
    kind: Deployment
    name: llama-gke-deploy
    apiVersion: apps/v1
  metrics:
  # Resource-based metric: aim for 70% average CPU utilization across pods.
  - resource:
      name: cpu
      target:
        averageUtilization: 70
        type: Utilization
    type: Resource