Add VictoriaMetrics for historical metrics (Mar 13+)

- Single-node VM deployment with 200Gi NVMe, 2y retention - Traefik IngressRoute at vm.vultrlabs.dev (TLS + basic auth) - Backfill script: pulls vLLM/DCGM metrics from Mimir, writes to VM - Retain StorageClass so historical data survives PVC deletion - README with deployment + Grafana mixed-datasource instructions
2026-04-09 19:29:18 +00:00
parent 7ade5ecac8
commit bf6d62b9a8
10 changed files with 690 additions and 0 deletions
--- a/victoriametrics/00-namespace.yaml
+++ b/victoriametrics/00-namespace.yaml
@@ -0,0 +1,10 @@
 ##############################################################################
 # Namespace for VictoriaMetrics (historical metrics store)
 ##############################################################################
 # Dedicated namespace that holds every VictoriaMetrics resource in this directory.
 apiVersion: v1
 kind: Namespace
 metadata:
  labels:
    app.kubernetes.io/part-of: "victoriametrics"
  name: "victoriametrics"
--- a/victoriametrics/01-storageclass.yaml
+++ b/victoriametrics/01-storageclass.yaml
@@ -0,0 +1,16 @@
 ##############################################################################
 # StorageClass — Vultr Block Storage CSI (for VictoriaMetrics)
 # Separate StorageClass with Retain policy so historical data isn't lost
 ##############################################################################
 # Retaining StorageClass served by the Vultr block-storage CSI provisioner.
 apiVersion: storage.k8s.io/v1
 kind: StorageClass
 metadata:
  name: vultr-block-storage-vm
 provisioner: block.csi.vultr.com
 # Delay provisioning until a pod that uses the claim has been scheduled.
 volumeBindingMode: WaitForFirstConsumer
 # Lets the PVC request be raised later without recreating the volume.
 allowVolumeExpansion: true
 # Retain: the backing volume is kept even if the PVC is deleted.
 reclaimPolicy: Retain
 parameters:
  storage_type: 'block'
  disk_type: 'nvme'
--- a/victoriametrics/02-deployment.yaml
+++ b/victoriametrics/02-deployment.yaml
@@ -0,0 +1,105 @@
 ##############################################################################
 # VictoriaMetrics Single-Node Deployment
 # Stores historical metrics from Mimir (Mar 13–present) for Grafana queries
 ##############################################################################
 # Single-replica VictoriaMetrics server. Data lives on the ReadWriteOnce
 # PVC `victoriametrics-data`, mounted at /data.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: victoriametrics
  namespace: victoriametrics
  labels:
    app.kubernetes.io/name: victoriametrics
 spec:
  replicas: 1
  # Recreate: terminate the old pod before starting its replacement. The data
  # volume is ReadWriteOnce, so with the default RollingUpdate strategy the new
  # pod would start while the old one still holds the volume, and a rollout
  # (e.g. `kubectl rollout restart`) could hang.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: victoriametrics
  template:
    metadata:
      labels:
        app.kubernetes.io/name: victoriametrics
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8428"
    spec:
      securityContext:
        # Group ownership applied to the mounted data volume.
        fsGroup: 65534
      containers:
        - name: victoriametrics
          image: victoriametrics/victoria-metrics:v1.115.0
          args:
            - "-storageDataPath=/data"
            - "-retentionPeriod=2y"              # Keep historical data for 2 years
            - "-httpListenAddr=:8428"
            - "-search.maxQueryDuration=120s"    # Long-running queries OK for historical
            - "-search.maxSamplesPerQuery=100000000"  # High limit for wide historical queries
            - "-memory.allowedBytes=4GB"          # Memory budget
            - "-search.maxUniqueTimeseries=5000000"  # Allow high cardinality
          ports:
            - name: http
              containerPort: 8428
          volumeMounts:
            - name: data
              mountPath: /data
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
            limits:
              cpu: "4"
              memory: 8Gi
          # Both probes hit /health on the named `http` port (8428).
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 10
            periodSeconds: 5
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: victoriametrics-data
 ---
 ##############################################################################
 # PVC — Vultr Block Storage for VictoriaMetrics data
 ##############################################################################
 # 200Gi single-writer claim provisioned from the retaining StorageClass.
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  namespace: victoriametrics
  name: victoriametrics-data
 spec:
  accessModes: [ReadWriteOnce]
  storageClassName: "vultr-block-storage-vm"
  resources:
    requests:
      storage: "200Gi"
 ---
 ##############################################################################
 # Service — ClusterIP (Traefik handles external access)
 ##############################################################################
 # In-cluster endpoint: forwards port 8428 to the pod port named `http`.
 apiVersion: v1
 kind: Service
 metadata:
  namespace: victoriametrics
  name: victoriametrics
  labels:
    app.kubernetes.io/name: "victoriametrics"
 spec:
  ports:
    - targetPort: http
      port: 8428
      name: http
  selector:
    app.kubernetes.io/name: "victoriametrics"
--- a/victoriametrics/03-ingressroute.yaml
+++ b/victoriametrics/03-ingressroute.yaml
@@ -0,0 +1,58 @@
 ##############################################################################
 # VictoriaMetrics Traefik IngressRoute
 # External: https://vm.vultrlabs.dev → Traefik → victoriametrics:8428
 ##############################################################################
 ---
 # HTTP redirect to HTTPS
 # Plain-HTTP route (entry point `web`); the redirect-https middleware is
 # attached to every request for this host.
 apiVersion: traefik.io/v1alpha1
 kind: IngressRoute
 metadata:
  namespace: victoriametrics
  name: victoriametrics-redirect
 spec:
  entryPoints: [web]
  routes:
    - kind: Rule
      match: 'Host(`vm.vultrlabs.dev`)'
      services:
        - port: 8428
          name: victoriametrics
      middlewares:
        - namespace: victoriametrics
          name: redirect-https
 ---
 # HTTPS with basic auth
 # TLS route (entry point `websecure`): requests pass through the basic-auth
 # middleware and are then forwarded to the victoriametrics Service on 8428.
 apiVersion: traefik.io/v1alpha1
 kind: IngressRoute
 metadata:
  namespace: victoriametrics
  name: victoriametrics
 spec:
  tls:
    certResolver: "letsencrypt"
  entryPoints: [websecure]
  routes:
    - kind: Rule
      match: 'Host(`vm.vultrlabs.dev`)'
      services:
        - port: 8428
          name: victoriametrics
      middlewares:
        - namespace: victoriametrics
          name: basic-auth
 ---
 # HTTPS redirect middleware
 # Permanently redirects matching requests to the https scheme.
 apiVersion: traefik.io/v1alpha1
 kind: Middleware
 metadata:
  namespace: victoriametrics
  name: redirect-https
 spec:
  redirectScheme:
    permanent: true
    scheme: "https"
--- a/victoriametrics/04-basic-auth-middleware.yaml
+++ b/victoriametrics/04-basic-auth-middleware.yaml
@@ -0,0 +1,33 @@
 ##############################################################################
 # Basic Auth Middleware for VictoriaMetrics Traefik IngressRoute
 # CHANGE THE PASSWORD BEFORE PRODUCTION USE!
 #
 # To generate a new htpasswd entry:
 #   htpasswd -nb <username> <password>
 # Paste the output as-is into `stringData.users` in the secret below.
 # Do NOT base64 encode it: `stringData` takes plain text (only values
 # under `data:` must be base64-encoded).
 ##############################################################################
 ---
 apiVersion: v1
 kind: Secret
 metadata:
  name: basic-auth-secret
  namespace: victoriametrics
 type: Opaque
 # Generate the entry with: htpasswd -nb vultr_vm <password>
 # Paste the raw htpasswd output below WITHOUT base64-encoding it: the value
 # sits under `stringData`, which takes plain text (only `data:` needs base64).
 # See .env for credentials
 stringData:
  users: |-
    vultr_vm:$apr1$ZtK5B1K4$SCWPgREqKwfcrCr4FA6En1
 ---
 # Basic-auth middleware; the user list is read from the Secret named below.
 apiVersion: traefik.io/v1alpha1
 kind: Middleware
 metadata:
  namespace: victoriametrics
  name: basic-auth
 spec:
  basicAuth:
    secret: "basic-auth-secret"
--- a/victoriametrics/05-backfill-secrets.yaml
+++ b/victoriametrics/05-backfill-secrets.yaml
@@ -0,0 +1,17 @@
 ##############################################################################
 # Secrets for backfill (Mimir credentials)
 # IMPORTANT: Update the password before running!
 #
 # To create the secret:
 #   kubectl create secret generic backfill-credentials \
 #     --from-literal=mimir-password='YOUR_PASSWORD' -n victoriametrics
 ##############################################################################
 # Carries the Mimir password under the key `mimir-password`.
 # The value is a placeholder and must be replaced before use.
 apiVersion: v1
 kind: Secret
 type: Opaque
 metadata:
  namespace: victoriametrics
  name: backfill-credentials
 stringData:
  mimir-password: 'REPLACE_WITH_MIMIR_PASSWORD'
--- a/victoriametrics/README.md
+++ b/victoriametrics/README.md
@@ -0,0 +1,185 @@
 # VictoriaMetrics — Historical Metrics Store
 VictoriaMetrics instance for querying historical vLLM + DCGM metrics (March 13, 2026 onward) that couldn't be backfilled into M3DB.
 ## Why VictoriaMetrics Instead of M3DB?
 M3DB doesn't support backfill. Period. See the [main README](../README.md#why-backfill-doesnt-work) for the full story.
 VictoriaMetrics has a first-class `/api/v1/import` endpoint that accepts data with any timestamp — no `bufferPast` gates, no block size hacks, no special namespaces. You just send the data and it works.
 ## Architecture
 ```
                 ┌─────────────────────────────────────────────────┐
                 │               Vultr VKE Cluster                 │
                 │                                                 │
 Mimir ──import──▶ VictoriaMetrics (1 pod, 200Gi NVMe)            │
                 │   ↓ PromQL queries                              │
                 │   Traefik (TLS + basic auth)                    │
                 │   ↓                                             │
                 │   vm.vultrlabs.dev                              │
                 └─────────────────────────────────────────────────┘
 Grafana queries both:
  - M3DB (m3db.vultrlabs.dev) → real-time data (1h blocks, going forward)
  - VictoriaMetrics (vm.vultrlabs.dev) → historical data (Mar 13–present)
 ```
 ## Quick Start
 ### 1. Deploy VictoriaMetrics
 ```bash
 # Apply manifests
 kubectl apply -k .
 # Wait for pod to be running
 kubectl -n victoriametrics get pods -w
 # Verify it's healthy
 kubectl -n victoriametrics port-forward svc/victoriametrics 8428:8428 &
 curl http://localhost:8428/health
 ```
 ### 2. Configure DNS
 Get the Traefik LoadBalancer IP and point `vm.vultrlabs.dev` at it:
 ```bash
 kubectl -n traefik get svc traefik
 ```
 ### 3. Set Up Basic Auth
 Generate an htpasswd entry and paste it into the secret in `04-basic-auth-middleware.yaml`:
 ```bash
 htpasswd -nb vultr_vm <your-password>
 # Copy the output as-is into `stringData.users` — do NOT base64 encode it
 # (`stringData` takes plain text; only `data:` values must be base64-encoded)
 # Update the secret and apply
 kubectl apply -f 04-basic-auth-middleware.yaml
 ```
 ### 4. Run Backfill
 ```bash
 # Create the secret with Mimir credentials
 kubectl create secret generic backfill-credentials \
  --from-literal=mimir-password='YOUR_MIMIR_PASSWORD' -n victoriametrics
 # Upload the backfill script as a configmap
 kubectl create configmap backfill-script \
  --from-file=backfill.py=backfill.py -n victoriametrics
 # Run the backfill pod
 kubectl apply -f backfill-pod.yaml
 # Watch progress
 kubectl logs -f backfill -n victoriametrics
 # Cleanup when done
 kubectl delete pod backfill -n victoriametrics
 kubectl delete configmap backfill-script -n victoriametrics
 kubectl delete secret backfill-credentials -n victoriametrics
 ```
 ### 5. Verify
 ```bash
 # In-cluster
 kubectl -n victoriametrics exec deploy/victoriametrics -- \
  curl -s 'http://localhost:8428/api/v1/query?query=vllm:prompt_tokens_total' | python3 -m json.tool
 # External (with auth)
 curl -u vultr_vm:<password> "https://vm.vultrlabs.dev/api/v1/query?query=up"
 ```
 ## Grafana Configuration
 Add VictoriaMetrics as a **Prometheus** datasource:
 - **URL:** `https://vm.vultrlabs.dev` (with basic auth)
 - **In-cluster URL:** `http://victoriametrics.victoriametrics.svc.cluster.local:8428`
 ### Mixed Queries (M3DB + VictoriaMetrics)
 Use a **Mixed** datasource in Grafana to query both:
 1. Create two Prometheus datasources:
   - `M3DB` → `https://m3db.vultrlabs.dev`
   - `VictoriaMetrics` → `https://vm.vultrlabs.dev`
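 2. In the panel editor, select Grafana's built-in **-- Mixed --** data source (it ships with Grafana — there is nothing to create)
 3. Add one query per backend and pick `M3DB` or `VictoriaMetrics` for each — Grafana runs every query against its own data source and draws the results in the same panel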
 Alternatively, use dashboard variables to let users toggle between datasources for different time ranges.
 ## Metrics Stored
 | Metric | Description |
 |--------|-------------|
 | `vllm:prompt_tokens_total` | vLLM prompt token count |
 | `vllm:generation_tokens_total` | vLLM generation token count |
 | `DCGM_FI_DEV_GPU_UTIL` | GPU utilization (DCGM) |
 All metrics are tagged with `tenant=serverless-inference-cluster` and `cluster=serverless-inference-cluster`.
 ## VictoriaMetrics API Reference
 | Endpoint | Purpose |
 |----------|---------|
 | `/api/v1/import` | Import data (JSON line format) |
 | `/api/v1/import/prometheus` | Import data (Prometheus text exposition format) |
 | `/api/v1/export` | Export data |
 | `/api/v1/query` | PromQL instant query |
 | `/api/v1/query_range` | PromQL range query |
 | `/health` | Health check |
 | `/metrics` | Internal metrics |
 ## Storage
 - **Size:** 200Gi NVMe (Vultr Block Storage)
 - **StorageClass:** `vultr-block-storage-vm` (Retain policy — data survives PVC deletion)
 - **Retention:** 2 years
 - **Volume expansion:** `kubectl edit pvc victoriametrics-data -n victoriametrics`
 ## Useful Commands
 ```bash
 # Check VM health
 kubectl -n victoriametrics exec deploy/victoriametrics -- curl -s http://localhost:8428/health
 # Check storage stats
 kubectl -n victoriametrics exec deploy/victoriametrics -- \
  curl -s 'http://localhost:8428/api/v1/query?query=vm_rows' | python3 -m json.tool
 # Query historical data
 curl -u vultr_vm:<password> \
  "https://vm.vultrlabs.dev/api/v1/query_range?query=vllm:prompt_tokens_total&start=1773360000&end=1773446400&step=60"
 # Restart VM (if needed)
 kubectl rollout restart deployment/victoriametrics -n victoriametrics
 # Scale to 0 (preserve data, stop the pod)
 kubectl scale deployment/victoriametrics --replicas=0 -n victoriametrics
 ```
 ## Re-running Backfill
 If you need to import additional time ranges or new metrics:
 1. Edit `backfill.py` — update `START_TS`, `END_TS`, or `METRICS`
 2. Recreate the configmap and pod (see step 4 above)
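 3. Re-importing the same range is safe, but duplicate data points are only collapsed when deduplication is enabled (`-dedup.minScrapeInterval`, e.g. `1ms`); this deployment does not set that flag, so enable it if you expect to re-import overlapping ranges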
 To convert timestamps:
 ```bash
 # Date → Unix timestamp
 date -u -d '2026-03-13 00:00:00' +%s    # 1773360000
 # Unix timestamp → date
 date -u -d @1773360000
 ```
--- a/victoriametrics/backfill-pod.yaml
+++ b/victoriametrics/backfill-pod.yaml
@@ -0,0 +1,46 @@
 ##############################################################################
 # Backfill Pod — One-shot job to import historical metrics from Mimir
 #
 # Usage:
 #   kubectl create configmap backfill-script \
 #     --from-file=backfill.py=backfill.py -n victoriametrics
 #   kubectl apply -f backfill-pod.yaml
 #   kubectl logs -f backfill -n victoriametrics
 #
 # Cleanup:
 #   kubectl delete pod backfill -n victoriametrics
 #   kubectl delete configmap backfill-script -n victoriametrics
 ##############################################################################
 # One-shot pod: runs /scripts/backfill.py (mounted from the `backfill-script`
 # ConfigMap) and is never restarted.
 apiVersion: v1
 kind: Pod
 metadata:
  namespace: victoriametrics
  name: backfill
 spec:
  restartPolicy: Never
  volumes:
    - name: script
      configMap:
        name: backfill-script
  containers:
    - name: backfill
      image: "python:3.12-slim"
      command:
        - python3
        - /scripts/backfill.py
      volumeMounts:
        - mountPath: /scripts
          name: script
      env:
        - name: MIMIR_USERNAME
          value: 'vultr_sea_inference'
        # Password comes from the backfill-credentials Secret.
        - name: MIMIR_PASSWORD
          valueFrom:
            secretKeyRef:
              key: mimir-password
              name: backfill-credentials
        - name: VM_URL
          value: 'http://victoriametrics.victoriametrics.svc.cluster.local:8428'
        # 2026-03-13T00:00:00Z, quoted because env values must be strings.
        - name: START_TS
          value: '1773360000'
        - name: CHUNK_HOURS
          value: '6'
--- a/victoriametrics/backfill.py
+++ b/victoriametrics/backfill.py
@@ -0,0 +1,211 @@
 #!/usr/bin/env python3
 """
 Backfill historical metrics from Mimir to VictoriaMetrics.
 Uses VictoriaMetrics /api/v1/import endpoint which happily accepts
 data with any timestamp — no bufferPast gates, no block size hacks.
 Usage:
  # Run in-cluster (as a pod, see backfill-pod.yaml)
  python3 backfill.py
  # Or locally with port-forward
  kubectl port-forward -n victoriametrics svc/victoriametrics 8428:8428
  VM_URL=http://localhost:8428 python3 backfill.py
 """
 import urllib.request
 import urllib.error
 import urllib.parse
 import json
 import ssl
 import os
 import time
 import base64
 import sys
 # ── Configuration ──────────────────────────────────────────────────
 # Source (Mimir) endpoint and credentials, each overridable via the environment.
 MIMIR_URL = os.getenv("MIMIR_URL", "https://metrics.vultrlabs.com/prometheus")
 MIMIR_USER = os.getenv("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
 MIMIR_PASS = os.getenv("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
 # Target (VictoriaMetrics) base URL.
 VM_URL = os.getenv("VM_URL", "http://victoriametrics.victoriametrics.svc.cluster.local:8428")
 # Backfill window in Unix seconds: defaults to 2026-03-13T00:00:00Z up to "now".
 START_TS = int(os.getenv("START_TS", "1773360000"))
 END_TS = int(os.getenv("END_TS", str(int(time.time()))))
 # Resolution passed to query_range, and the width of each query window.
 STEP = os.getenv("STEP", "10s")
 CHUNK_HOURS = int(os.getenv("CHUNK_HOURS", "6"))
 # Metric names copied from Mimir into VictoriaMetrics.
 METRICS = ["vllm:prompt_tokens_total", "vllm:generation_tokens_total", "DCGM_FI_DEV_GPU_UTIL"]
 # Labels stamped onto every imported series (tenant/cluster context).
 EXTRA_LABELS = dict(
    tenant="serverless-inference-cluster",
    cluster="serverless-inference-cluster",
 )
 # ── Helpers ────────────────────────────────────────────────────────
 def ssl_ctx():
    """Build the TLS context used for HTTPS requests to Mimir.

    Certificate and hostname verification stay enabled by default, because the
    requests made with this context carry the Mimir basic-auth credentials.
    Set the environment variable MIMIR_TLS_INSECURE to "1", "true" or "yes"
    to turn verification off (e.g. for an endpoint with a self-signed
    certificate).

    Returns:
        ssl.SSLContext: a default context, with verification disabled only
        when MIMIR_TLS_INSECURE opts in.
    """
    ctx = ssl.create_default_context()
    insecure = os.environ.get("MIMIR_TLS_INSECURE", "").strip().lower()
    if insecure in ("1", "true", "yes"):
        # check_hostname must be switched off before verify_mode may be CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
    return ctx
 def mimir_query(path):
    """Send a GET request to the Mimir API with HTTP basic auth.

    Args:
        path: request path plus query string, appended verbatim to MIMIR_URL.

    Returns:
        The response body decoded as JSON.

    Raises:
        urllib.error.URLError (including HTTPError) on transport or HTTP
        failures; a JSON decoding error if the body is not valid JSON.
    """
    credentials = f"{MIMIR_USER}:{MIMIR_PASS}".encode()
    token = base64.b64encode(credentials).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {token}")
    # The context manager closes the connection once the body has been read;
    # this function is called once per chunk, so open responses would pile up.
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
 def vm_import(lines):
    """Push samples to VictoriaMetrics in Prometheus text exposition format.

    Args:
        lines: iterable of strings shaped like
            ``metric_name{label="value"} <timestamp_ms> <value>``.

    Returns:
        True when VictoriaMetrics accepted the request, False when it replied
        with an HTTP error status (the status code and the first 200
        characters of the body are printed). Other failures, such as
        connection errors, are not caught here and propagate to the caller.
    """
    data = "\n".join(lines).encode("utf-8")
    # /api/v1/import parses VictoriaMetrics' JSON-line format only; text
    # exposition lines have to go to /api/v1/import/prometheus.
    req = urllib.request.Request(
        f"{VM_URL}/api/v1/import/prometheus",
        data=data,
        method="POST",
    )
    req.add_header("Content-Type", "application/octet-stream")
    try:
        # Context manager releases the connection after each import request.
        with urllib.request.urlopen(req, timeout=300):
            return True
    except urllib.error.HTTPError as e:
        body = e.read().decode(errors="replace")[:200]
        print(f"  VM import ERROR {e.code}: {body}", flush=True)
        return False
 def format_prom_metric_name(raw_name):
    """Return the metric name to use in VictoriaMetrics import lines.

    This is a pass-through: the name received from Mimir is handed back
    unchanged, colons included (e.g. ``vllm:prompt_tokens_total``).
    """
    metric_name = raw_name
    return metric_name
 # ── Main ───────────────────────────────────────────────────────────
 # Startup banner: echo the effective configuration, followed by a blank line.
 print(
    "VictoriaMetrics Backfill",
    "========================",
    f"Source:  {MIMIR_URL}",
    f"Target:  {VM_URL}",
    f"Range:   {START_TS} → {END_TS} ({CHUNK_HOURS}h chunks)",
    f"Metrics: {', '.join(METRICS)}",
    f"Extra labels: {EXTRA_LABELS}",
    "",
    sep="\n",
    flush=True,
 )
 # Running totals across all metrics, reported at the end of the run.
 total_samples, total_errors = 0, 0
 def _escape_label_value(value):
    """Escape a label value for the Prometheus text exposition format.

    Backslash, double quote and newline must be escaped inside a quoted label
    value; written raw they produce a malformed import line.
    """
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )
 # For every metric, walk START_TS..END_TS in CHUNK_HOURS-sized windows: query
 # Mimir for the window, turn each sample into one text line, and push the
 # lines to VictoriaMetrics. A failing window is reported and skipped so the
 # rest of the range is still processed.
 for metric in METRICS:
    print(f"\n{'='*60}", flush=True)
    print(f"Metric: {metric}", flush=True)
    print(f"{'='*60}", flush=True)
    metric_samples = 0
    chunk_start = START_TS
    while chunk_start < END_TS:
        chunk_end = min(chunk_start + CHUNK_HOURS * 3600, END_TS)
        chunk_label = f"[{time.strftime('%Y-%m-%d %H:%M', time.gmtime(chunk_start))} → {time.strftime('%Y-%m-%d %H:%M', time.gmtime(chunk_end))}]"
        print(f"  {chunk_label} ...", end="", flush=True)
        try:
            path = (
                f"/api/v1/query_range?"
                f"query={urllib.parse.quote(metric)}"
                f"&start={chunk_start}&end={chunk_end}&step={STEP}"
            )
            data = mimir_query(path)
            if data.get("status") != "success":
                print(f" Mimir returned status={data.get('status')}", flush=True)
                chunk_start = chunk_end
                continue
            series_list = data["data"]["result"]
            if not series_list:
                print(f" no data", flush=True)
                chunk_start = chunk_end
                continue
            # Build one line per sample in Prometheus text exposition format:
            #   metric_name{label1="val1",label2="val2"} value timestamp_ms
            import_lines = []
            for series in series_list:
                labels = dict(series["metric"])
                # __name__ is the metric name, not a regular label.
                metric_name = labels.pop("__name__", metric)
                labels.update(EXTRA_LABELS)
                label_str = ",".join(
                    f'{k}="{_escape_label_value(v)}"' for k, v in sorted(labels.items())
                )
                for ts_str, val_str in series["values"]:
                    # Seconds -> milliseconds; round so float error cannot
                    # shave a millisecond off fractional timestamps.
                    ts_ms = int(round(float(ts_str) * 1000))
                    try:
                        # Validation only: float() accepts "+Inf", "-Inf" and
                        # "NaN", so only unparsable values are skipped.
                        float(val_str)
                    except (ValueError, TypeError):
                        continue
                    import_lines.append(f'{metric_name}{{{label_str}}} {val_str} {ts_ms}')
            chunk_count = len(import_lines)
            if import_lines:
                ok = vm_import(import_lines)
                if ok:
                    print(f" {chunk_count} samples imported", flush=True)
                    metric_samples += chunk_count
                else:
                    print(f" IMPORT FAILED ({chunk_count} samples lost)", flush=True)
                    total_errors += chunk_count
            else:
                print(f" 0 samples", flush=True)
        except Exception as e:
            print(f" ERROR: {e}", flush=True)
            total_errors += 1
        chunk_start = chunk_end
    print(f"  Total for {metric}: {metric_samples} samples", flush=True)
    total_samples += metric_samples
 # Final report, then a smoke test that VictoriaMetrics answers a query.
 print(
    f"\n{'='*60}",
    "BACKFILL COMPLETE",
    f"Total samples imported: {total_samples}",
    f"Total errors: {total_errors}",
    f"{'='*60}",
    "\nVerifying import...",
    sep="\n",
    flush=True,
 )
 try:
    # Only reachability is checked; the query result itself is not inspected.
    urllib.request.urlopen(
        urllib.request.Request(VM_URL + "/api/v1/query?query=" + urllib.parse.quote("count(up)")),
        timeout=30,
    )
    print("VM is responding to queries ✓", flush=True)
 except Exception as e:
    print(f"VM query check failed: {e}", flush=True)
--- a/victoriametrics/kustomization.yaml
+++ b/victoriametrics/kustomization.yaml
@@ -0,0 +1,9 @@
 # Kustomize entry point listing the manifests applied by `kubectl apply -k .`.
 # 05-backfill-secrets.yaml and backfill-pod.yaml are not listed here.
 kind: Kustomization
 apiVersion: kustomize.config.k8s.io/v1beta1
 resources:
  - "00-namespace.yaml"
  - "01-storageclass.yaml"
  - "02-deployment.yaml"
  - "03-ingressroute.yaml"
  - "04-basic-auth-middleware.yaml"