diff --git a/victoriametrics/00-namespace.yaml b/victoriametrics/00-namespace.yaml
new file mode 100644
index 0000000..4199254
--- /dev/null
+++ b/victoriametrics/00-namespace.yaml
@@ -0,0 +1,10 @@
+##############################################################################
+# Namespace for VictoriaMetrics (historical metrics store)
+##############################################################################
+
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: victoriametrics
+  labels:
+    app.kubernetes.io/part-of: victoriametrics
diff --git a/victoriametrics/01-storageclass.yaml b/victoriametrics/01-storageclass.yaml
new file mode 100644
index 0000000..76d0dec
--- /dev/null
+++ b/victoriametrics/01-storageclass.yaml
@@ -0,0 +1,16 @@
+##############################################################################
+# StorageClass — Vultr Block Storage CSI (for VictoriaMetrics)
+# Separate StorageClass with Retain policy so historical data isn't lost
+##############################################################################
+
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: vultr-block-storage-vm
+provisioner: block.csi.vultr.com
+parameters:
+  disk_type: "nvme"
+  storage_type: "block"
+reclaimPolicy: Retain  # Keep the volume even if PVC is deleted
+allowVolumeExpansion: true
+volumeBindingMode: WaitForFirstConsumer
diff --git a/victoriametrics/02-deployment.yaml b/victoriametrics/02-deployment.yaml
new file mode 100644
index 0000000..a169e3b
--- /dev/null
+++ b/victoriametrics/02-deployment.yaml
@@ -0,0 +1,108 @@
+##############################################################################
+# VictoriaMetrics Single-Node Deployment
+# Stores historical metrics from Mimir (Mar 13–present) for Grafana queries
+##############################################################################
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: victoriametrics
+  namespace: victoriametrics
+  labels:
+    app.kubernetes.io/name: victoriametrics
+spec:
+  replicas: 1
+  # The data volume is ReadWriteOnce: stop the old pod before starting the new one
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: victoriametrics
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: victoriametrics
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8428"
+    spec:
+      securityContext:
+        fsGroup: 65534
+      containers:
+        - name: victoriametrics
+          image: victoriametrics/victoria-metrics:v1.115.0
+          args:
+            - "-storageDataPath=/data"
+            - "-retentionPeriod=2y"  # Keep historical data for 2 years
+            - "-httpListenAddr=:8428"
+            - "-search.maxQueryDuration=120s"  # Long-running queries OK for historical
+            - "-search.maxSamplesPerQuery=100000000"  # High limit for wide historical queries
+            - "-memory.allowedBytes=4GB"  # Memory budget
+            - "-search.maxUniqueTimeseries=5000000"  # Allow high cardinality
+          ports:
+            - name: http
+              containerPort: 8428
+          volumeMounts:
+            - name: data
+              mountPath: /data
+          resources:
+            requests:
+              cpu: "2"
+              memory: 4Gi
+            limits:
+              cpu: "4"
+              memory: 8Gi
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 30
+            periodSeconds: 15
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 10
+            periodSeconds: 5
+      volumes:
+        - name: data
+          persistentVolumeClaim:
+            claimName: victoriametrics-data
+
+---
+##############################################################################
+# PVC — Vultr Block Storage for VictoriaMetrics data
+##############################################################################
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: victoriametrics-data
+  namespace: victoriametrics
+spec:
+  storageClassName: vultr-block-storage-vm
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 200Gi
+
+---
+##############################################################################
+# Service — ClusterIP (Traefik handles external access)
+##############################################################################
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: victoriametrics
+  namespace: victoriametrics
+  labels:
+    app.kubernetes.io/name: victoriametrics
+spec:
+  selector:
+    app.kubernetes.io/name: victoriametrics
+  ports:
+    - name: http
+      port: 8428
+      targetPort: http
diff --git a/victoriametrics/03-ingressroute.yaml b/victoriametrics/03-ingressroute.yaml
new file mode 100644
index 0000000..ff9e1cf
--- /dev/null
+++ b/victoriametrics/03-ingressroute.yaml
@@ -0,0 +1,58 @@
+##############################################################################
+# VictoriaMetrics Traefik IngressRoute
+# External: https://vm.vultrlabs.dev → Traefik → victoriametrics:8428
+##############################################################################
+
+---
+# HTTP redirect to HTTPS
+apiVersion: traefik.io/v1alpha1
+kind: IngressRoute
+metadata:
+  name: victoriametrics-redirect
+  namespace: victoriametrics
+spec:
+  entryPoints:
+    - web
+  routes:
+    - match: Host(`vm.vultrlabs.dev`)
+      kind: Rule
+      middlewares:
+        - name: redirect-https
+          namespace: victoriametrics
+      services:
+        - name: victoriametrics
+          port: 8428
+
+---
+# HTTPS with basic auth
+apiVersion: traefik.io/v1alpha1
+kind: IngressRoute
+metadata:
+  name: victoriametrics
+  namespace: victoriametrics
+spec:
+  entryPoints:
+    - websecure
+  routes:
+    - match: Host(`vm.vultrlabs.dev`)
+      kind: Rule
+      middlewares:
+        - name: basic-auth
+          namespace: victoriametrics
+      services:
+        - name: victoriametrics
+          port: 8428
+  tls:
+    certResolver: letsencrypt
+
+---
+# HTTPS redirect middleware
+apiVersion: traefik.io/v1alpha1
+kind: Middleware
+metadata:
+  name: redirect-https
+  namespace: victoriametrics
+spec:
+  redirectScheme:
+    scheme: https
+    permanent: true
diff --git a/victoriametrics/04-basic-auth-middleware.yaml b/victoriametrics/04-basic-auth-middleware.yaml
new file mode 100644
index 0000000..2b8a5b3
--- /dev/null
+++ b/victoriametrics/04-basic-auth-middleware.yaml
@@ -0,0 +1,33 @@
+##############################################################################
+# Basic Auth Middleware for VictoriaMetrics Traefik IngressRoute
+# CHANGE THE PASSWORD BEFORE PRODUCTION USE!
+#
+# To generate a new htpasswd entry:
+#   htpasswd -nb vultr_vm 'YOUR_PASSWORD'
+# Paste the output line as-is under stringData.users below. Do NOT
+# base64-encode it: stringData takes plain text (only `data` is base64).
+# Update the secret below with the new value.
+##############################################################################
+
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: basic-auth-secret
+  namespace: victoriametrics
+type: Opaque
+# Generate with: htpasswd -nb vultr_vm 'YOUR_PASSWORD'  (plain text, no base64)
+# See .env for credentials
+stringData:
+  users: |-
+    vultr_vm:$apr1$ZtK5B1K4$SCWPgREqKwfcrCr4FA6En1
+
+---
+apiVersion: traefik.io/v1alpha1
+kind: Middleware
+metadata:
+  name: basic-auth
+  namespace: victoriametrics
+spec:
+  basicAuth:
+    secret: basic-auth-secret
diff --git a/victoriametrics/05-backfill-secrets.yaml b/victoriametrics/05-backfill-secrets.yaml
new file mode 100644
index 0000000..cde1791
--- /dev/null
+++ b/victoriametrics/05-backfill-secrets.yaml
@@ -0,0 +1,17 @@
+##############################################################################
+# Secrets for backfill (Mimir credentials)
+# IMPORTANT: Update the password before running!
+#
+# To create the secret:
+#   kubectl create secret generic backfill-credentials \
+#     --from-literal=mimir-password='YOUR_PASSWORD' -n victoriametrics
+##############################################################################
+
+apiVersion: v1
+kind: Secret
+metadata:
+  name: backfill-credentials
+  namespace: victoriametrics
+type: Opaque
+stringData:
+  mimir-password: "REPLACE_WITH_MIMIR_PASSWORD"
diff --git a/victoriametrics/README.md b/victoriametrics/README.md
new file mode 100644
index 0000000..30ccdae
--- /dev/null
+++ b/victoriametrics/README.md
@@ -0,0 +1,186 @@
+# VictoriaMetrics — Historical Metrics Store
+
+VictoriaMetrics instance for querying historical vLLM + DCGM metrics (March 13, 2026 onward) that couldn't be backfilled into M3DB.
+
+## Why VictoriaMetrics Instead of M3DB?
+
+M3DB doesn't support backfill. Period. See the [main README](../README.md#why-backfill-doesnt-work) for the full story.
+
+VictoriaMetrics has a first-class `/api/v1/import` endpoint that accepts data with any timestamp — no `bufferPast` gates, no block size hacks, no special namespaces. You just send the data and it works.
+
+## Architecture
+
+```
+                  ┌─────────────────────────────────────────────────┐
+                  │                Vultr VKE Cluster                │
+                  │                                                 │
+Mimir ──import──▶ VictoriaMetrics (1 pod, 200Gi NVMe)               │
+                  │         ↓ PromQL queries                        │
+                  │  Traefik (TLS + basic auth)                     │
+                  │         ↓                                       │
+                  │  vm.vultrlabs.dev                               │
+                  └─────────────────────────────────────────────────┘
+
+Grafana queries both:
+  - M3DB (m3db.vultrlabs.dev) → real-time data (1h blocks, going forward)
+  - VictoriaMetrics (vm.vultrlabs.dev) → historical data (Mar 13–present)
+```
+
+## Quick Start
+
+### 1. Deploy VictoriaMetrics
+
+```bash
+# Apply manifests
+kubectl apply -k .
+
+# Wait for pod to be running
+kubectl -n victoriametrics get pods -w
+
+# Verify it's healthy
+kubectl -n victoriametrics port-forward svc/victoriametrics 8428:8428 &
+curl http://localhost:8428/health
+```
+
+### 2. Configure DNS
+
+Get the Traefik LoadBalancer IP and point `vm.vultrlabs.dev` at it:
+
+```bash
+kubectl -n traefik get svc traefik
+```
+
+### 3. Set Up Basic Auth
+
+Generate htpasswd and update the secret in `04-basic-auth-middleware.yaml`:
+
+```bash
+htpasswd -nb vultr_vm 'YOUR_PASSWORD'
+# Paste the output line as-is under stringData.users in the Secret.
+# Do NOT base64-encode it: stringData takes plain text (only `data` is base64).
+# Then apply:
+kubectl apply -f 04-basic-auth-middleware.yaml
+```
+
+### 4. Run Backfill
+
+```bash
+# Create the secret with Mimir credentials
+kubectl create secret generic backfill-credentials \
+  --from-literal=mimir-password='YOUR_MIMIR_PASSWORD' -n victoriametrics
+
+# Upload the backfill script as a configmap
+kubectl create configmap backfill-script \
+  --from-file=backfill.py=backfill.py -n victoriametrics
+
+# Run the backfill pod
+kubectl apply -f backfill-pod.yaml
+
+# Watch progress
+kubectl logs -f backfill -n victoriametrics
+
+# Cleanup when done
+kubectl delete pod backfill -n victoriametrics
+kubectl delete configmap backfill-script -n victoriametrics
+kubectl delete secret backfill-credentials -n victoriametrics
+```
+
+### 5. Verify
+
+```bash
+# In-cluster
+kubectl -n victoriametrics exec deploy/victoriametrics -- \
+  curl -s 'http://localhost:8428/api/v1/query?query=vllm:prompt_tokens_total' | python3 -m json.tool
+
+# External (with auth)
+curl -u 'vultr_vm:YOUR_PASSWORD' "https://vm.vultrlabs.dev/api/v1/query?query=up"
+```
+
+## Grafana Configuration
+
+Add VictoriaMetrics as a **Prometheus** datasource:
+
+- **URL:** `https://vm.vultrlabs.dev` (with basic auth)
+- **In-cluster URL:** `http://victoriametrics.victoriametrics.svc.cluster.local:8428`
+
+### Mixed Queries (M3DB + VictoriaMetrics)
+
+Use Grafana's built-in **-- Mixed --** datasource to query both in one panel:
+
+1. Create two Prometheus datasources:
+   - `M3DB` → `https://m3db.vultrlabs.dev`
+   - `VictoriaMetrics` → `https://vm.vultrlabs.dev`
+
+2. In the panel editor, select the built-in **-- Mixed --** datasource (it does not need to be created)
+
+3. Add one query per backend and pick `M3DB` or `VictoriaMetrics` for each — Grafana runs each query against its own datasource and overlays the results in the panel
+
+Alternatively, use dashboard variables to let users toggle between datasources for different time ranges.
+
+## Metrics Stored
+
+| Metric | Description |
+|--------|-------------|
+| `vllm:prompt_tokens_total` | vLLM prompt token count |
+| `vllm:generation_tokens_total` | vLLM generation token count |
+| `DCGM_FI_DEV_GPU_UTIL` | GPU utilization (DCGM) |
+
+All metrics are tagged with `tenant=serverless-inference-cluster` and `cluster=serverless-inference-cluster`.
+
+## VictoriaMetrics API Reference
+
+| Endpoint | Purpose |
+|----------|---------|
+| `/api/v1/import` | Import data (JSON line format) |
+| `/api/v1/import/prometheus` | Import data (Prometheus text exposition format; used by `backfill.py`) |
+| `/api/v1/export` | Export data |
+| `/api/v1/query` | PromQL instant query |
+| `/api/v1/query_range` | PromQL range query |
+| `/health` | Health check |
+| `/metrics` | Internal metrics |
+
+## Storage
+
+- **Size:** 200Gi NVMe (Vultr Block Storage)
+- **StorageClass:** `vultr-block-storage-vm` (Retain policy — data survives PVC deletion)
+- **Retention:** 2 years
+- **Volume expansion:** `kubectl edit pvc victoriametrics-data -n victoriametrics`
+
+## Useful Commands
+
+```bash
+# Check VM health
+kubectl -n victoriametrics exec deploy/victoriametrics -- curl -s http://localhost:8428/health
+
+# Check storage stats
+kubectl -n victoriametrics exec deploy/victoriametrics -- \
+  curl -s 'http://localhost:8428/api/v1/query?query=vm_rows' | python3 -m json.tool
+
+# Query historical data (2026-03-13 → 2026-03-14)
+curl -u 'vultr_vm:YOUR_PASSWORD' \
+  "https://vm.vultrlabs.dev/api/v1/query_range?query=vllm:prompt_tokens_total&start=1773360000&end=1773446400&step=60"
+
+# Restart VM (if needed)
+kubectl rollout restart deployment/victoriametrics -n victoriametrics
+
+# Scale to 0 (preserve data, stop the pod)
+kubectl scale deployment/victoriametrics --replicas=0 -n victoriametrics
+```
+
+## Re-running Backfill
+
+If you need to import additional time ranges or new metrics:
+
+1. Edit `backfill.py` — update `START_TS`, `END_TS`, or `METRICS`
+2. Recreate the configmap and pod (see step 4 above)
+3. VictoriaMetrics is idempotent for imports — duplicate data points are merged, not duplicated
+
+To convert timestamps:
+
+```bash
+# Date → Unix timestamp
+date -u -d '2026-03-13 00:00:00' +%s  # 1773360000
+
+# Unix timestamp → date
+date -u -d @1773360000
+```
diff --git a/victoriametrics/backfill-pod.yaml b/victoriametrics/backfill-pod.yaml
new file mode 100644
index 0000000..3c99d0c
--- /dev/null
+++ b/victoriametrics/backfill-pod.yaml
@@ -0,0 +1,46 @@
+##############################################################################
+# Backfill Pod — One-shot job to import historical metrics from Mimir
+#
+# Usage:
+#   kubectl create configmap backfill-script \
+#     --from-file=backfill.py=backfill.py -n victoriametrics
+#   kubectl apply -f backfill-pod.yaml
+#   kubectl logs -f backfill -n victoriametrics
+#
+# Cleanup:
+#   kubectl delete pod backfill -n victoriametrics
+#   kubectl delete configmap backfill-script -n victoriametrics
+##############################################################################
+
+apiVersion: v1
+kind: Pod
+metadata:
+  name: backfill
+  namespace: victoriametrics
+spec:
+  restartPolicy: Never
+  containers:
+    - name: backfill
+      image: python:3.12-slim
+      command: ["python3", "/scripts/backfill.py"]
+      env:
+        - name: MIMIR_USERNAME
+          value: "vultr_sea_inference"
+        - name: MIMIR_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: backfill-credentials
+              key: mimir-password
+        - name: VM_URL
+          value: "http://victoriametrics.victoriametrics.svc.cluster.local:8428"
+        - name: START_TS
+          value: "1773360000"  # 2026-03-13T00:00:00Z
+        - name: CHUNK_HOURS
+          value: "6"
+      volumeMounts:
+        - name: script
+          mountPath: /scripts
+  volumes:
+    - name: script
+      configMap:
+        name: backfill-script
diff --git a/victoriametrics/backfill.py b/victoriametrics/backfill.py
new file mode 100644
index 0000000..c958c4b
--- /dev/null
+++ b/victoriametrics/backfill.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""
+Backfill historical metrics from Mimir to VictoriaMetrics.
+
+Uses the VictoriaMetrics /api/v1/import/prometheus endpoint (Prometheus text
+exposition format), which accepts data with any timestamp — no bufferPast
+gates, no block size hacks.
+
+Usage:
+    # Run in-cluster (as a pod, see backfill-pod.yaml)
+    python3 backfill.py
+
+    # Or locally with port-forward
+    kubectl port-forward -n victoriametrics svc/victoriametrics 8428:8428
+    VM_URL=http://localhost:8428 python3 backfill.py
+
+Exits non-zero if any chunk failed, so the backfill pod ends in the Error
+phase rather than Completed.
+"""
+
+import urllib.request
+import urllib.error
+import urllib.parse
+import json
+import ssl
+import os
+import time
+import base64
+import sys
+
+# ── Configuration ──────────────────────────────────────────────────
+
+MIMIR_URL = os.environ.get("MIMIR_URL", "https://metrics.vultrlabs.com/prometheus")
+MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
+MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
+# TLS verification is on by default because basic-auth credentials are sent
+# to Mimir. Set MIMIR_INSECURE_SKIP_VERIFY=1 only for a self-signed test endpoint.
+MIMIR_INSECURE_SKIP_VERIFY = os.environ.get("MIMIR_INSECURE_SKIP_VERIFY", "") == "1"
+
+VM_URL = os.environ.get("VM_URL", "http://victoriametrics.victoriametrics.svc.cluster.local:8428")
+
+# Time range: March 13, 2026 00:00:00 UTC → now
+START_TS = int(os.environ.get("START_TS", "1773360000"))  # 2026-03-13T00:00:00Z
+END_TS = int(os.environ.get("END_TS", str(int(time.time()))))
+
+STEP = os.environ.get("STEP", "10s")
+CHUNK_HOURS = int(os.environ.get("CHUNK_HOURS", "6"))
+
+# Metrics to backfill
+METRICS = [
+    "vllm:prompt_tokens_total",
+    "vllm:generation_tokens_total",
+    "DCGM_FI_DEV_GPU_UTIL",
+]
+
+# Extra labels to add to all imported data (e.g. tenant/cluster context)
+EXTRA_LABELS = {
+    "tenant": "serverless-inference-cluster",
+    "cluster": "serverless-inference-cluster",
+}
+
+# ── Helpers ────────────────────────────────────────────────────────
+
+def ssl_ctx():
+    """Build the TLS context for Mimir requests.
+
+    Certificates and hostnames are verified unless MIMIR_INSECURE_SKIP_VERIFY
+    is set, in which case verification is switched off (testing only).
+    """
+    ctx = ssl.create_default_context()
+    if MIMIR_INSECURE_SKIP_VERIFY:
+        ctx.check_hostname = False
+        ctx.verify_mode = ssl.CERT_NONE
+    return ctx
+
+def mimir_query(path):
+    """GET `path` from the Mimir API with basic auth; return the parsed JSON."""
+    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
+    req = urllib.request.Request(f"{MIMIR_URL}{path}")
+    req.add_header("Authorization", f"Basic {auth}")
+    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
+        return json.loads(resp.read().decode())
+
+def escape_label_value(value):
+    """Escape backslash, double quote and newline for a text-format label value."""
+    return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+
+def vm_import(lines):
+    """POST Prometheus text-format lines to /api/v1/import/prometheus.
+
+    Returns True on success, False when VictoriaMetrics answers with an HTTP
+    error (status and the first 200 characters of the body are printed).
+    Connection-level errors propagate to the caller.
+    """
+    data = ("\n".join(lines) + "\n").encode("utf-8")
+    req = urllib.request.Request(
+        f"{VM_URL}/api/v1/import/prometheus",
+        data=data,
+        method="POST",
+    )
+    req.add_header("Content-Type", "text/plain")
+    try:
+        with urllib.request.urlopen(req, timeout=300):
+            return True
+    except urllib.error.HTTPError as e:
+        body = e.read().decode(errors="replace")[:200]
+        print(f" VM import ERROR {e.code}: {body}", flush=True)
+        return False
+
+def format_prom_metric_name(raw_name):
+    """Return the metric name to use in VictoriaMetrics (currently unchanged).
+
+    Text-format lines look like: metric_name{label1="val1",...} value timestamp
+    Colons in metric names are valid in Prometheus, so names are kept as-is.
+    """
+    return raw_name
+
+# ── Main ───────────────────────────────────────────────────────────
+
+if CHUNK_HOURS <= 0:
+    sys.exit("CHUNK_HOURS must be a positive integer")
+
+print(f"VictoriaMetrics Backfill", flush=True)
+print(f"========================", flush=True)
+print(f"Source: {MIMIR_URL}", flush=True)
+print(f"Target: {VM_URL}", flush=True)
+print(f"Range: {START_TS} → {END_TS} ({CHUNK_HOURS}h chunks)", flush=True)
+print(f"Metrics: {', '.join(METRICS)}", flush=True)
+print(f"Extra labels: {EXTRA_LABELS}", flush=True)
+print(flush=True)
+
+total_samples = 0
+total_errors = 0
+
+for metric in METRICS:
+    print(f"\n{'='*60}", flush=True)
+    print(f"Metric: {metric}", flush=True)
+    print(f"{'='*60}", flush=True)
+
+    metric_samples = 0
+    chunk_start = START_TS
+
+    while chunk_start < END_TS:
+        chunk_end = min(chunk_start + CHUNK_HOURS * 3600, END_TS)
+        # query_range includes both endpoints; stop 1s short on every chunk
+        # but the last so the boundary sample is not imported twice.
+        query_end = chunk_end if chunk_end >= END_TS else chunk_end - 1
+        chunk_label = f"[{time.strftime('%Y-%m-%d %H:%M', time.gmtime(chunk_start))} → {time.strftime('%Y-%m-%d %H:%M', time.gmtime(chunk_end))}]"
+        print(f"  {chunk_label} ...", end="", flush=True)
+
+        try:
+            path = (
+                f"/api/v1/query_range?"
+                f"query={urllib.parse.quote(metric)}"
+                f"&start={chunk_start}&end={query_end}&step={STEP}"
+            )
+            data = mimir_query(path)
+
+            if data.get("status") != "success":
+                print(f" Mimir returned status={data.get('status')}", flush=True)
+                total_errors += 1
+                chunk_start = chunk_end
+                continue
+
+            series_list = data["data"]["result"]
+            if not series_list:
+                print(f" no data", flush=True)
+                chunk_start = chunk_end
+                continue
+
+            # Build import lines in Prometheus text exposition format
+            # Format: metric_name{label1="val1",label2="val2"} value timestamp_ms
+            import_lines = []
+            chunk_count = 0
+
+            for series in series_list:
+                labels = dict(series["metric"])
+                # Remove __name__ from labels (it's the metric name)
+                metric_name = labels.pop("__name__", metric)
+
+                # Add extra labels
+                labels.update(EXTRA_LABELS)
+
+                # Build label string (values escaped for the text format)
+                label_parts = [f'{k}="{escape_label_value(v)}"' for k, v in sorted(labels.items())]
+                label_str = ",".join(label_parts)
+
+                # Build import lines: one per sample
+                for ts_str, val_str in series["values"]:
+                    # Convert timestamp (seconds) to ms for VM
+                    ts_ms = int(round(float(ts_str) * 1000))
+                    try:
+                        float(val_str)  # also accepts NaN, +Inf and -Inf
+                    except (ValueError, TypeError):
+                        continue  # not a number — skip the sample
+                    import_lines.append(f'{metric_name}{{{label_str}}} {val_str} {ts_ms}')
+                    chunk_count += 1
+
+            if import_lines:
+                ok = vm_import(import_lines)
+                if ok:
+                    print(f" {chunk_count} samples imported", flush=True)
+                    metric_samples += chunk_count
+                else:
+                    print(f" IMPORT FAILED ({chunk_count} samples lost)", flush=True)
+                    total_errors += chunk_count
+            else:
+                print(f" 0 samples", flush=True)
+
+        except Exception as e:
+            print(f" ERROR: {e}", flush=True)
+            total_errors += 1
+
+        chunk_start = chunk_end
+
+    print(f"  Total for {metric}: {metric_samples} samples", flush=True)
+    total_samples += metric_samples
+
+print(f"\n{'='*60}", flush=True)
+print(f"BACKFILL COMPLETE", flush=True)
+print(f"Total samples imported: {total_samples}", flush=True)
+print(f"Total errors: {total_errors}", flush=True)
+print(f"{'='*60}", flush=True)
+
+# Verify by querying VM
+print(f"\nVerifying import...", flush=True)
+try:
+    verify_path = f"/api/v1/query?query={urllib.parse.quote('count(up)')}"
+    req = urllib.request.Request(f"{VM_URL}{verify_path}")
+    with urllib.request.urlopen(req, timeout=30):
+        print(f"VM is responding to queries ✓", flush=True)
+except Exception as e:
+    print(f"VM query check failed: {e}", flush=True)
+
+# Non-zero exit so a failed backfill is visible in the pod status
+if total_errors:
+    sys.exit(1)
diff --git a/victoriametrics/kustomization.yaml b/victoriametrics/kustomization.yaml
new file mode 100644
index 0000000..1963d49
--- /dev/null
+++ b/victoriametrics/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - 00-namespace.yaml
+  - 01-storageclass.yaml
+  - 02-deployment.yaml
+  - 03-ingressroute.yaml
+  - 04-basic-auth-middleware.yaml