Add VictoriaMetrics for historical metrics (Mar 13+)
- Single-node VM deployment with 200Gi NVMe, 2y retention - Traefik IngressRoute at vm.vultrlabs.dev (TLS + basic auth) - Backfill script: pulls vLLM/DCGM metrics from Mimir, writes to VM - Retain StorageClass so historical data survives PVC deletion - README with deployment + Grafana mixed-datasource instructions
This commit is contained in:
10
victoriametrics/00-namespace.yaml
Normal file
10
victoriametrics/00-namespace.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
##############################################################################
|
||||||
|
# Namespace for VictoriaMetrics (historical metrics store)
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
# Dedicated namespace; every other manifest in this directory targets it.
kind: Namespace
apiVersion: v1
metadata:
  labels:
    app.kubernetes.io/part-of: victoriametrics
  name: victoriametrics
|
||||||
16
victoriametrics/01-storageclass.yaml
Normal file
16
victoriametrics/01-storageclass.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
##############################################################################
|
||||||
|
# StorageClass — Vultr Block Storage CSI (for VictoriaMetrics)
|
||||||
|
# Separate StorageClass with Retain policy so historical data isn't lost
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: vultr-block-storage-vm
provisioner: block.csi.vultr.com
# Keep the volume even if the PVC is deleted.
reclaimPolicy: Retain
# Delay provisioning until a pod that uses the claim is scheduled.
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
parameters:
  storage_type: 'block'
  disk_type: 'nvme'
|
||||||
105
victoriametrics/02-deployment.yaml
Normal file
105
victoriametrics/02-deployment.yaml
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
##############################################################################
|
||||||
|
# VictoriaMetrics Single-Node Deployment
|
||||||
|
# Stores historical metrics from Mimir (Mar 13–present) for Grafana queries
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
apiVersion: apps/v1
kind: Deployment
metadata:
  name: victoriametrics
  namespace: victoriametrics
  labels:
    app.kubernetes.io/name: victoriametrics
spec:
  replicas: 1
  # The data volume is a single ReadWriteOnce claim. With the default
  # RollingUpdate strategy a replacement pod is started before the old one is
  # stopped, and it may be unable to attach/share that volume, stalling the
  # rollout. Recreate stops the old pod first.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: victoriametrics
  template:
    metadata:
      labels:
        app.kubernetes.io/name: victoriametrics
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8428"
    spec:
      securityContext:
        # Group ownership applied to the mounted data volume.
        fsGroup: 65534
      containers:
        - name: victoriametrics
          image: victoriametrics/victoria-metrics:v1.115.0
          args:
            - "-storageDataPath=/data"
            - "-retentionPeriod=2y"  # Keep historical data for 2 years
            - "-httpListenAddr=:8428"
            - "-search.maxQueryDuration=120s"  # Long-running queries OK for historical
            - "-search.maxSamplesPerQuery=100000000"  # High limit for wide historical queries
            - "-memory.allowedBytes=4GB"  # Memory budget
            - "-search.maxUniqueTimeseries=5000000"  # Allow high cardinality
          ports:
            - name: http
              containerPort: 8428
          volumeMounts:
            - name: data
              mountPath: /data
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
            limits:
              cpu: "4"
              memory: 8Gi
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 10
            periodSeconds: 5
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: victoriametrics-data
|
||||||
|
|
||||||
|
---
|
||||||
|
##############################################################################
|
||||||
|
# PVC — Vultr Block Storage for VictoriaMetrics data
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: victoriametrics
  name: victoriametrics-data
spec:
  # Bound through the Retain-policy StorageClass named below.
  storageClassName: vultr-block-storage-vm
  resources:
    requests:
      storage: 200Gi
  accessModes: [ReadWriteOnce]
|
||||||
|
|
||||||
|
---
|
||||||
|
##############################################################################
|
||||||
|
# Service — ClusterIP (Traefik handles external access)
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
kind: Service
apiVersion: v1
metadata:
  namespace: victoriametrics
  name: victoriametrics
  labels:
    app.kubernetes.io/name: victoriametrics
spec:
  ports:
    # Forwards to the container port named "http" on the selected pods.
    - port: 8428
      targetPort: http
      name: http
  selector:
    app.kubernetes.io/name: victoriametrics
|
||||||
58
victoriametrics/03-ingressroute.yaml
Normal file
58
victoriametrics/03-ingressroute.yaml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
##############################################################################
|
||||||
|
# VictoriaMetrics Traefik IngressRoute
|
||||||
|
# External: https://vm.vultrlabs.dev → Traefik → victoriametrics:8428
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
---
|
||||||
|
# HTTP redirect to HTTPS
|
||||||
|
kind: IngressRoute
apiVersion: traefik.io/v1alpha1
metadata:
  namespace: victoriametrics
  name: victoriametrics-redirect
spec:
  # Plain-HTTP entry point; the redirect-https middleware is attached below.
  entryPoints: [web]
  routes:
    - kind: Rule
      match: 'Host(`vm.vultrlabs.dev`)'
      middlewares:
        - namespace: victoriametrics
          name: redirect-https
      services:
        - port: 8428
          name: victoriametrics
|
||||||
|
|
||||||
|
---
|
||||||
|
# HTTPS with basic auth
|
||||||
|
kind: IngressRoute
apiVersion: traefik.io/v1alpha1
metadata:
  namespace: victoriametrics
  name: victoriametrics
spec:
  tls:
    certResolver: letsencrypt
  entryPoints: [websecure]
  routes:
    - kind: Rule
      match: 'Host(`vm.vultrlabs.dev`)'
      # Requests pass through the basic-auth middleware before the service.
      middlewares:
        - namespace: victoriametrics
          name: basic-auth
      services:
        - port: 8428
          name: victoriametrics
|
||||||
|
|
||||||
|
---
|
||||||
|
# HTTPS redirect middleware
|
||||||
|
kind: Middleware
apiVersion: traefik.io/v1alpha1
metadata:
  namespace: victoriametrics
  name: redirect-https
spec:
  redirectScheme:
    # permanent: true makes this a permanent redirect to the https scheme.
    permanent: true
    scheme: https
|
||||||
33
victoriametrics/04-basic-auth-middleware.yaml
Normal file
33
victoriametrics/04-basic-auth-middleware.yaml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
##############################################################################
|
||||||
|
# Basic Auth Middleware for VictoriaMetrics Traefik IngressRoute
|
||||||
|
# CHANGE THE PASSWORD BEFORE PRODUCTION USE!
|
||||||
|
#
|
||||||
|
# To generate a new htpasswd entry:
|
||||||
|
# htpasswd -nb <username> <password>
|
||||||
|
# Paste that output as-is under stringData.users. Only if you switch the Secret to `data:` must you base64 encode it first:
|
||||||
|
# echo -n '<htpasswd-output>' | base64
|
||||||
|
# Update the secret below with the new value.
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
kind: Secret
metadata:
  name: basic-auth-secret
  namespace: victoriametrics
type: Opaque
# Generate with: htpasswd -nb vultr_vm <password>
# Paste the output verbatim below: stringData takes the plain value, so it
# must NOT be base64 encoded (that is only required under `data:`).
# See .env for credentials
# NOTE(review): this password hash is committed to version control; rotate it
# and consider creating the Secret out-of-band instead.
stringData:
  users: |-
    vultr_vm:$apr1$ZtK5B1K4$SCWPgREqKwfcrCr4FA6En1
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: Middleware
apiVersion: traefik.io/v1alpha1
metadata:
  namespace: victoriametrics
  name: basic-auth
spec:
  basicAuth:
    # Name of the Secret in this namespace that holds the htpasswd entries.
    secret: basic-auth-secret
|
||||||
17
victoriametrics/05-backfill-secrets.yaml
Normal file
17
victoriametrics/05-backfill-secrets.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
##############################################################################
|
||||||
|
# Secrets for backfill (Mimir credentials)
|
||||||
|
# IMPORTANT: Update the password before running!
|
||||||
|
#
|
||||||
|
# To create the secret:
|
||||||
|
# kubectl create secret generic backfill-credentials \
|
||||||
|
# --from-literal=mimir-password='YOUR_PASSWORD' -n victoriametrics
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
kind: Secret
apiVersion: v1
type: Opaque
metadata:
  namespace: victoriametrics
  name: backfill-credentials
stringData:
  # Placeholder value; replace it before running the backfill.
  mimir-password: 'REPLACE_WITH_MIMIR_PASSWORD'
|
||||||
185
victoriametrics/README.md
Normal file
185
victoriametrics/README.md
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
# VictoriaMetrics — Historical Metrics Store
|
||||||
|
|
||||||
|
VictoriaMetrics instance for querying historical vLLM + DCGM metrics (March 13, 2026 onward) that couldn't be backfilled into M3DB.
|
||||||
|
|
||||||
|
## Why VictoriaMetrics Instead of M3DB?
|
||||||
|
|
||||||
|
M3DB doesn't support backfill. Period. See the [main README](../README.md#why-backfill-doesnt-work) for the full story.
|
||||||
|
|
||||||
|
VictoriaMetrics has a first-class `/api/v1/import` endpoint that accepts data with any timestamp — no `bufferPast` gates, no block size hacks, no special namespaces. You just send the data and it works.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────┐
|
||||||
|
│ Vultr VKE Cluster │
|
||||||
|
│ │
|
||||||
|
Mimir ──import──▶ VictoriaMetrics (1 pod, 200Gi NVMe) │
|
||||||
|
│ ↓ PromQL queries │
|
||||||
|
│ Traefik (TLS + basic auth) │
|
||||||
|
│ ↓ │
|
||||||
|
│ vm.vultrlabs.dev │
|
||||||
|
└─────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
Grafana queries both:
|
||||||
|
- M3DB (m3db.vultrlabs.dev) → real-time data (1h blocks, going forward)
|
||||||
|
- VictoriaMetrics (vm.vultrlabs.dev) → historical data (Mar 13–present)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Deploy VictoriaMetrics
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Apply manifests
|
||||||
|
kubectl apply -k .
|
||||||
|
|
||||||
|
# Wait for pod to be running
|
||||||
|
kubectl -n victoriametrics get pods -w
|
||||||
|
|
||||||
|
# Verify it's healthy
|
||||||
|
kubectl -n victoriametrics port-forward svc/victoriametrics 8428:8428 &
|
||||||
|
curl http://localhost:8428/health
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Configure DNS
|
||||||
|
|
||||||
|
Get the Traefik LoadBalancer IP and point `vm.vultrlabs.dev` at it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl -n traefik get svc traefik
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Set Up Basic Auth
|
||||||
|
|
||||||
|
Generate htpasswd and update the secret in `04-basic-auth-middleware.yaml`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
htpasswd -nb vultr_vm <your-password>
|
||||||
|
# Copy the output into stringData.users as-is. Only if you switch the Secret to data: do you need to base64 encode it first:
|
||||||
|
echo -n '<htpasswd-output>' | base64
|
||||||
|
# Update the secret and apply
|
||||||
|
kubectl apply -f 04-basic-auth-middleware.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Run Backfill
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create the secret with Mimir credentials
|
||||||
|
kubectl create secret generic backfill-credentials \
|
||||||
|
--from-literal=mimir-password='YOUR_MIMIR_PASSWORD' -n victoriametrics
|
||||||
|
|
||||||
|
# Upload the backfill script as a configmap
|
||||||
|
kubectl create configmap backfill-script \
|
||||||
|
--from-file=backfill.py=backfill.py -n victoriametrics
|
||||||
|
|
||||||
|
# Run the backfill pod
|
||||||
|
kubectl apply -f backfill-pod.yaml
|
||||||
|
|
||||||
|
# Watch progress
|
||||||
|
kubectl logs -f backfill -n victoriametrics
|
||||||
|
|
||||||
|
# Cleanup when done
|
||||||
|
kubectl delete pod backfill -n victoriametrics
|
||||||
|
kubectl delete configmap backfill-script -n victoriametrics
|
||||||
|
kubectl delete secret backfill-credentials -n victoriametrics
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Verify
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# In-cluster
|
||||||
|
kubectl -n victoriametrics exec deploy/victoriametrics -- \
|
||||||
|
curl -s 'http://localhost:8428/api/v1/query?query=vllm:prompt_tokens_total' | python3 -m json.tool
|
||||||
|
|
||||||
|
# External (with auth)
|
||||||
|
curl -u vultr_vm:<password> "https://vm.vultrlabs.dev/api/v1/query?query=up"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Grafana Configuration
|
||||||
|
|
||||||
|
Add VictoriaMetrics as a **Prometheus** datasource:
|
||||||
|
|
||||||
|
- **URL:** `https://vm.vultrlabs.dev` (with basic auth)
|
||||||
|
- **In-cluster URL:** `http://victoriametrics.victoriametrics.svc.cluster.local:8428`
|
||||||
|
|
||||||
|
### Mixed Queries (M3DB + VictoriaMetrics)
|
||||||
|
|
||||||
|
Use a **Mixed** datasource in Grafana to query both:
|
||||||
|
|
||||||
|
1. Create two Prometheus datasources:
|
||||||
|
- `M3DB` → `https://m3db.vultrlabs.dev`
|
||||||
|
- `VictoriaMetrics` → `https://vm.vultrlabs.dev`
|
||||||
|
|
||||||
|
2. In the panel editor, select Grafana's built-in **-- Mixed --** datasource (it is a special built-in entry, not a datasource you create)
|
||||||
|
|
||||||
|
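3. In dashboards, add one query per backend to the panel and pick `M3DB` or `VictoriaMetrics` for each — Grafana runs every query against its own datasource and draws the results together in the same panel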
|
||||||
|
|
||||||
|
Alternatively, use dashboard variables to let users toggle between datasources for different time ranges.
|
||||||
|
|
||||||
|
## Metrics Stored
|
||||||
|
|
||||||
|
| Metric | Description |
|
||||||
|
|--------|-------------|
|
||||||
|
| `vllm:prompt_tokens_total` | vLLM prompt token count |
|
||||||
|
| `vllm:generation_tokens_total` | vLLM generation token count |
|
||||||
|
| `DCGM_FI_DEV_GPU_UTIL` | GPU utilization (DCGM) |
|
||||||
|
|
||||||
|
All metrics are tagged with `tenant=serverless-inference-cluster` and `cluster=serverless-inference-cluster`.
|
||||||
|
|
||||||
|
## VictoriaMetrics API Reference
|
||||||
|
|
||||||
|
| Endpoint | Purpose |
|
||||||
|
|----------|---------|
|
||||||
|
| `/api/v1/import` | Import data (JSON line format; use `/api/v1/import/prometheus` for Prometheus text format) |
|
||||||
|
| `/api/v1/export` | Export data |
|
||||||
|
| `/api/v1/query` | PromQL instant query |
|
||||||
|
| `/api/v1/query_range` | PromQL range query |
|
||||||
|
| `/health` | Health check |
|
||||||
|
| `/metrics` | Internal metrics |
|
||||||
|
|
||||||
|
## Storage
|
||||||
|
|
||||||
|
- **Size:** 200Gi NVMe (Vultr Block Storage)
|
||||||
|
- **StorageClass:** `vultr-block-storage-vm` (Retain policy — data survives PVC deletion)
|
||||||
|
- **Retention:** 2 years
|
||||||
|
- **Volume expansion:** `kubectl edit pvc victoriametrics-data -n victoriametrics`
|
||||||
|
|
||||||
|
## Useful Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check VM health
|
||||||
|
kubectl -n victoriametrics exec deploy/victoriametrics -- curl -s http://localhost:8428/health
|
||||||
|
|
||||||
|
# Check storage stats
|
||||||
|
kubectl -n victoriametrics exec deploy/victoriametrics -- \
|
||||||
|
curl -s 'http://localhost:8428/api/v1/query?query=vm_rows' | python3 -m json.tool
|
||||||
|
|
||||||
|
# Query historical data
|
||||||
|
curl -u vultr_vm:<password> \
  "https://vm.vultrlabs.dev/api/v1/query_range?query=vllm:prompt_tokens_total&start=1773360000&end=1773446400&step=60"
|
||||||
|
|
||||||
|
# Restart VM (if needed)
|
||||||
|
kubectl rollout restart deployment/victoriametrics -n victoriametrics
|
||||||
|
|
||||||
|
# Scale to 0 (preserve data, stop the pod)
|
||||||
|
kubectl scale deployment/victoriametrics --replicas=0 -n victoriametrics
|
||||||
|
```
|
||||||
|
|
||||||
|
## Re-running Backfill
|
||||||
|
|
||||||
|
If you need to import additional time ranges or new metrics:
|
||||||
|
|
||||||
|
1. Edit `backfill.py` — update `START_TS`, `END_TS`, or `METRICS`
|
||||||
|
2. Recreate the configmap and pod (see step 4 above)
|
||||||
|
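3. Re-importing a range that was already imported is safe for queries, but VictoriaMetrics only drops duplicate samples when `-dedup.minScrapeInterval` is set (it is not set in `02-deployment.yaml`); without it, repeated imports store the same points again and use extra disk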
|
||||||
|
|
||||||
|
To convert timestamps:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Date → Unix timestamp
|
||||||
|
date -u -d '2026-03-13 00:00:00' +%s # 1773360000
|
||||||
|
|
||||||
|
# Unix timestamp → date
|
||||||
|
date -u -d @1773360000
|
||||||
|
```
|
||||||
46
victoriametrics/backfill-pod.yaml
Normal file
46
victoriametrics/backfill-pod.yaml
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
##############################################################################
|
||||||
|
# Backfill Pod — One-shot job to import historical metrics from Mimir
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# kubectl create configmap backfill-script \
|
||||||
|
# --from-file=backfill.py=backfill.py -n victoriametrics
|
||||||
|
# kubectl apply -f backfill-pod.yaml
|
||||||
|
# kubectl logs -f backfill -n victoriametrics
|
||||||
|
#
|
||||||
|
# Cleanup:
|
||||||
|
# kubectl delete pod backfill -n victoriametrics
|
||||||
|
# kubectl delete configmap backfill-script -n victoriametrics
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
kind: Pod
apiVersion: v1
metadata:
  namespace: victoriametrics
  name: backfill
spec:
  restartPolicy: Never
  volumes:
    # The script is supplied by the backfill-script ConfigMap.
    - name: script
      configMap:
        name: backfill-script
  containers:
    - name: backfill
      image: python:3.12-slim
      command:
        - python3
        - /scripts/backfill.py
      volumeMounts:
        - mountPath: /scripts
          name: script
      env:
        - name: MIMIR_USERNAME
          value: 'vultr_sea_inference'
        # Password is read from the backfill-credentials Secret.
        - name: MIMIR_PASSWORD
          valueFrom:
            secretKeyRef:
              key: mimir-password
              name: backfill-credentials
        - name: VM_URL
          value: 'http://victoriametrics.victoriametrics.svc.cluster.local:8428'
        - name: START_TS
          value: '1773360000'  # 2026-03-13T00:00:00Z
        - name: CHUNK_HOURS
          value: '6'
|
||||||
211
victoriametrics/backfill.py
Normal file
211
victoriametrics/backfill.py
Normal file
@@ -0,0 +1,211 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Backfill historical metrics from Mimir to VictoriaMetrics.
|
||||||
|
|
||||||
|
Uses VictoriaMetrics /api/v1/import endpoint which happily accepts
|
||||||
|
data with any timestamp — no bufferPast gates, no block size hacks.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Run in-cluster (as a pod, see backfill-pod.yaml)
|
||||||
|
python3 backfill.py
|
||||||
|
|
||||||
|
# Or locally with port-forward
|
||||||
|
kubectl port-forward -n victoriametrics svc/victoriametrics 8428:8428
|
||||||
|
VM_URL=http://localhost:8428 python3 backfill.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
import urllib.parse
|
||||||
|
import json
|
||||||
|
import ssl
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import base64
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# ── Configuration ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
MIMIR_URL = os.environ.get("MIMIR_URL", "https://metrics.vultrlabs.com/prometheus")
|
||||||
|
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
|
||||||
|
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
|
||||||
|
|
||||||
|
VM_URL = os.environ.get("VM_URL", "http://victoriametrics.victoriametrics.svc.cluster.local:8428")
|
||||||
|
|
||||||
|
# Time range: March 13, 2026 00:00:00 UTC → now
|
||||||
|
START_TS = int(os.environ.get("START_TS", "1773360000")) # 2026-03-13T00:00:00Z
|
||||||
|
END_TS = int(os.environ.get("END_TS", str(int(time.time()))))
|
||||||
|
|
||||||
|
STEP = os.environ.get("STEP", "10s")
|
||||||
|
CHUNK_HOURS = int(os.environ.get("CHUNK_HOURS", "6"))
|
||||||
|
|
||||||
|
# Metrics to backfill
|
||||||
|
METRICS = [
|
||||||
|
"vllm:prompt_tokens_total",
|
||||||
|
"vllm:generation_tokens_total",
|
||||||
|
"DCGM_FI_DEV_GPU_UTIL",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Extra labels to add to all imported data (e.g. tenant/cluster context)
|
||||||
|
EXTRA_LABELS = {
|
||||||
|
"tenant": "serverless-inference-cluster",
|
||||||
|
"cluster": "serverless-inference-cluster",
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def ssl_ctx():
    """Build the TLS context used for requests to Mimir.

    Certificate and hostname verification are enabled by default, because the
    request carries basic-auth credentials. Set the environment variable
    MIMIR_INSECURE_SKIP_VERIFY to 1/true/yes to disable verification (the
    previous unconditional behavior), e.g. for a self-signed endpoint.

    Returns:
        ssl.SSLContext: context to pass to urllib.request.urlopen.
    """
    ctx = ssl.create_default_context()
    skip_verify = os.environ.get("MIMIR_INSECURE_SKIP_VERIFY", "").strip().lower()
    if skip_verify in ("1", "true", "yes"):
        # check_hostname has to be cleared before verify_mode may be CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
    return ctx
|
||||||
|
|
||||||
|
def mimir_query(path):
    """Send a GET request to the Mimir API using basic auth and return the parsed JSON.

    Args:
        path: Request path plus query string, appended to MIMIR_URL.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError / HTTPError on transport or HTTP failures, and
        ValueError if the body is not valid JSON.
    """
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    # Context manager closes the connection even if reading or parsing fails.
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
|
||||||
|
|
||||||
|
def vm_import(lines):
    """POST newline-joined import lines to VictoriaMetrics /api/v1/import.

    Args:
        lines: Iterable of strings; they are joined with newlines and sent as
            the request body.

    Returns:
        True when the request succeeds, False when VictoriaMetrics answers
        with an HTTP error (the status and the first 200 characters of the
        body are printed). Other errors, e.g. connection failures, propagate
        to the caller.
    """
    payload = "\n".join(lines).encode("utf-8")
    req = urllib.request.Request(
        f"{VM_URL}/api/v1/import",
        data=payload,
        method="POST",
    )
    req.add_header("Content-Type", "application/octet-stream")
    try:
        # The response body is not needed; the with-block just closes it.
        with urllib.request.urlopen(req, timeout=300):
            return True
    except urllib.error.HTTPError as e:
        # errors="replace" so a non-UTF-8 error body cannot raise here.
        body = e.read().decode(errors="replace")[:200]
        print(f" VM import ERROR {e.code}: {body}", flush=True)
        return False
|
||||||
|
|
||||||
|
def format_prom_metric_name(raw_name):
    """Return the metric name to use in VictoriaMetrics for a Mimir metric name.

    This is currently the identity: the name is handed back untouched,
    including any colons (as in ``vllm:prompt_tokens_total``).
    """
    vm_name = raw_name
    return vm_name
|
||||||
|
|
||||||
|
# ── Main ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_UNPARSEABLE = object()  # sentinel returned by _encode_value for junk input


def _encode_value(val_str):
    """Convert a Prometheus sample value string into a json.dumps-safe value.

    Finite numbers come back as floats. JSON has no NaN/Inf literals, so NaN
    becomes None (JSON null) and +/-Inf become "Infinity" / "-Infinity".
    NOTE(review): this mirrors the encoding VictoriaMetrics' JSON line export
    uses; confirm the deployed VM version accepts it on import.
    Returns _UNPARSEABLE when the string is not a number at all.
    """
    try:
        val = float(val_str)
    except (ValueError, TypeError):
        return _UNPARSEABLE
    if val != val:  # NaN is the only value not equal to itself
        return None
    if val == float("inf"):
        return "Infinity"
    if val == float("-inf"):
        return "-Infinity"
    return val


def _series_to_import_line(series, default_name, ts_before):
    """Encode one query_range series as a JSON line for /api/v1/import.

    Args:
        series: One element of data.result from a query_range response
            ({"metric": {...}, "values": [[ts, "value"], ...]}).
        default_name: Metric name to use when the series has no __name__.
        ts_before: Exclusive upper bound (seconds) for sample timestamps, or
            None to keep every sample.

    Returns:
        (line, sample_count); line is None when no sample survives.
    """
    labels = dict(series["metric"])
    labels.setdefault("__name__", default_name)
    labels.update(EXTRA_LABELS)

    timestamps = []
    values = []
    for ts_raw, val_str in series["values"]:
        ts = float(ts_raw)
        if ts_before is not None and ts >= ts_before:
            continue
        value = _encode_value(val_str)
        if value is _UNPARSEABLE:
            continue
        # VictoriaMetrics expects millisecond timestamps.
        timestamps.append(int(round(ts * 1000)))
        values.append(value)

    if not timestamps:
        return None, 0
    # json.dumps escapes quotes/backslashes/newlines in label values.
    line = json.dumps({"metric": labels, "values": values, "timestamps": timestamps})
    return line, len(timestamps)


print("VictoriaMetrics Backfill", flush=True)
print("========================", flush=True)
print(f"Source: {MIMIR_URL}", flush=True)
print(f"Target: {VM_URL}", flush=True)
print(f"Range: {START_TS} → {END_TS} ({CHUNK_HOURS}h chunks)", flush=True)
print(f"Metrics: {', '.join(METRICS)}", flush=True)
print(f"Extra labels: {EXTRA_LABELS}", flush=True)
print(flush=True)

total_samples = 0
total_errors = 0

for metric in METRICS:
    print(f"\n{'='*60}", flush=True)
    print(f"Metric: {metric}", flush=True)
    print(f"{'='*60}", flush=True)

    metric_samples = 0
    chunk_start = START_TS

    while chunk_start < END_TS:
        chunk_end = min(chunk_start + CHUNK_HOURS * 3600, END_TS)
        chunk_label = (
            f"[{time.strftime('%Y-%m-%d %H:%M', time.gmtime(chunk_start))}"
            f" → {time.strftime('%Y-%m-%d %H:%M', time.gmtime(chunk_end))}]"
        )
        print(f" {chunk_label} ...", end="", flush=True)

        # query_range includes both start and end, and the next chunk starts
        # at this chunk's end. Keep the boundary sample only in the later
        # chunk (or in the final chunk) so it is not imported twice.
        ts_before = None if chunk_end >= END_TS else chunk_end

        try:
            query = urllib.parse.urlencode(
                {"query": metric, "start": chunk_start, "end": chunk_end, "step": STEP}
            )
            data = mimir_query(f"/api/v1/query_range?{query}")

            if data.get("status") != "success":
                print(f" Mimir returned status={data.get('status')}", flush=True)
                total_errors += 1
            else:
                series_list = data["data"]["result"]
                import_lines = []
                chunk_count = 0
                for series in series_list:
                    line, kept = _series_to_import_line(series, metric, ts_before)
                    if line is not None:
                        import_lines.append(line)
                        chunk_count += kept

                if not series_list:
                    print(" no data", flush=True)
                elif not import_lines:
                    print(" 0 samples", flush=True)
                elif vm_import(import_lines):
                    print(f" {chunk_count} samples imported", flush=True)
                    metric_samples += chunk_count
                else:
                    print(f" IMPORT FAILED ({chunk_count} samples lost)", flush=True)
                    total_errors += chunk_count

        except Exception as e:
            # Best effort: record the failure and move on to the next chunk.
            print(f" ERROR: {e}", flush=True)
            total_errors += 1

        chunk_start = chunk_end

    print(f" Total for {metric}: {metric_samples} samples", flush=True)
    total_samples += metric_samples

print(f"\n{'='*60}", flush=True)
print("BACKFILL COMPLETE", flush=True)
print(f"Total samples imported: {total_samples}", flush=True)
print(f"Total errors: {total_errors}", flush=True)
print(f"{'='*60}", flush=True)

# Verify by querying VM (this only checks that VM answers queries).
print("\nVerifying import...", flush=True)
try:
    verify_path = f"/api/v1/query?query={urllib.parse.quote('count(up)')}"
    req = urllib.request.Request(f"{VM_URL}{verify_path}")
    with urllib.request.urlopen(req, timeout=30):
        pass
    print("VM is responding to queries ✓", flush=True)
except Exception as e:
    print(f"VM query check failed: {e}", flush=True)

# Exit non-zero when anything failed so the failure is visible in the pod status.
if total_errors:
    sys.exit(1)
|
||||||
9
victoriametrics/kustomization.yaml
Normal file
9
victoriametrics/kustomization.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1

# Manifests applied by `kubectl apply -k .`
resources:
  - 00-namespace.yaml
  - 01-storageclass.yaml
  - 02-deployment.yaml
  - 03-ingressroute.yaml
  - 04-basic-auth-middleware.yaml
|
||||||
Reference in New Issue
Block a user