tweaks with backfill and grafana

This commit is contained in:
2026-04-01 15:21:10 +00:00
parent a6c59d6a65
commit 1af29e8f09
16 changed files with 944 additions and 119 deletions

18
.env.example Normal file
View File

@@ -0,0 +1,18 @@
# M3DB Cluster Credentials
# ========================
# Copy this file to .env and fill in your values
# M3DB Basic Auth (coordinator API access)
M3DB_USERNAME=<your-m3db-username>
M3DB_PASSWORD=<your-m3db-password>
# Mimir (source for backfill)
MIMIR_USERNAME=<your-mimir-username>
MIMIR_PASSWORD=<your-mimir-password>
# Grafana Admin
GRAFANA_ADMIN_PASSWORD=<your-grafana-admin-password>
# M3DB Basic Auth (htpasswd base64)
# Generate with: htpasswd -nb <username> <password> | base64
# (base64 of the htpasswd hash line — NOT of the raw "username:password" string)
M3DB_HTPASSWD_B64=<base64-encoded-htpasswd>

3
.gitignore vendored
View File

@@ -1 +1,2 @@
kubeconfig.yaml
kubeconfig.yaml
.env

View File

@@ -111,11 +111,11 @@ spec:
mountPath: /var/lib/m3kv
resources:
requests:
cpu: "1"
memory: 4Gi
limits:
cpu: "2"
cpu: "4"
memory: 8Gi
limits:
cpu: "8"
memory: 20Gi
livenessProbe:
httpGet:
path: /health

View File

@@ -1,117 +1,70 @@
##############################################################################
# M3 Coordinator — Deployment
# Stateless query/write layer — Prometheus remote_write & remote_read target
# This is what Grafana and Prometheus talk to (replaces Mimir endpoints)
##############################################################################
apiVersion: apps/v1
kind: Deployment
metadata:
name: m3coordinator
namespace: m3db
labels:
app.kubernetes.io/name: m3coordinator
app.kubernetes.io/part-of: m3db
spec:
replicas: 2
selector:
matchLabels:
app.kubernetes.io/name: m3coordinator
template:
metadata:
labels:
app.kubernetes.io/name: m3coordinator
app.kubernetes.io/part-of: m3db
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "7203"
spec:
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app.kubernetes.io/name
operator: In
values:
- m3coordinator
topologyKey: kubernetes.io/hostname
containers:
- name: m3coordinator
image: quay.io/m3db/m3coordinator:v1.5.0
imagePullPolicy: IfNotPresent
args:
- "-f"
- "/etc/m3coordinator/m3coordinator.yml"
ports:
- containerPort: 7201
name: api
protocol: TCP
- containerPort: 7203
name: metrics
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/m3coordinator
- name: cache-dir
mountPath: /var/lib/m3kv
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: "1"
memory: 2Gi
livenessProbe:
httpGet:
path: /health
port: 7201
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 7201
initialDelaySeconds: 10
periodSeconds: 5
volumes:
- name: config
configMap:
name: m3coordinator-config
- name: cache-dir
emptyDir: {}
---
##############################################################################
# M3 Coordinator Service
# Endpoints for Prometheus remote_write / remote_read / Grafana
#
# remote_write → http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/write
# remote_read → http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/read
# query (Grafana Prometheus datasource) → http://m3coordinator.m3db.svc.cluster.local:7201
##############################################################################
apiVersion: v1
kind: Service
kind: ConfigMap
metadata:
name: m3coordinator
name: m3coordinator-config
namespace: m3db
labels:
app.kubernetes.io/name: m3coordinator
app.kubernetes.io/part-of: m3db
spec:
type: ClusterIP
ports:
- name: api
port: 7201
targetPort: 7201
protocol: TCP
- name: metrics
port: 7203
targetPort: 7203
protocol: TCP
selector:
app.kubernetes.io/name: m3coordinator
data:
m3coordinator.yml: |
listenAddress: 0.0.0.0:7201
logging:
level: info
metrics:
scope:
prefix: coordinator
prometheus:
handlerPath: /metrics
listenAddress: 0.0.0.0:7203
sanitization: prometheus
samplingRate: 1.0
tagOptions:
idScheme: quoted
clusters:
- namespaces:
- namespace: default
type: unaggregated
retention: 720h
- namespace: agg_1m_60d
type: aggregated
retention: 1440h
resolution: 1m
- namespace: agg_1h_1y
type: aggregated
retention: 8760h
resolution: 1h
client:
config:
service:
env: default_env
zone: embedded
service: m3db
cacheDir: /var/lib/m3kv
etcdClusters:
- zone: embedded
endpoints:
- http://etcd-0.etcd.m3db.svc.cluster.local:2379
- http://etcd-1.etcd.m3db.svc.cluster.local:2379
- http://etcd-2.etcd.m3db.svc.cluster.local:2379
writeConsistencyLevel: majority
readConsistencyLevel: unstrict_majority
downsample:
rules:
mappingRules:
- name: "1min for 60 days"
filter: "__name__:*"
aggregations: ["Last"]
storagePolicies:
- resolution: 1m
retention: 1440h
- name: "1hour for 1 year"
filter: "__name__:*"
aggregations: ["Last"]
storagePolicies:
- resolution: 1h
retention: 8760h

View File

@@ -16,10 +16,11 @@ metadata:
name: basic-auth-secret
namespace: m3db
type: Opaque
# htpasswd -nb example example
# Generate the hash with: htpasswd -nb vultr_m3db <password>
# (stringData takes the plain htpasswd line — Kubernetes base64-encodes
# Secret data itself, so do not pre-encode it)
# See .env for credentials
stringData:
users: |-
example:$apr1$oMBgtfpd$CBTS17sDq7GN58qaoIMvh.
vultr_m3db:$apr1$xyz$tempplaceholderREPLACEFROMENV
---
apiVersion: traefik.io/v1alpha1

163
10-grafana.yaml Normal file
View File

@@ -0,0 +1,163 @@
##############################################################################
# Grafana - Visualization for M3DB metrics
# Deployed on dedicated grafana nodepool
# Exposed via LoadBalancer (no TLS - Grafana has built-in auth)
##############################################################################
---
apiVersion: v1
kind: Namespace
metadata:
name: grafana
labels:
app.kubernetes.io/name: grafana
---
apiVersion: v1
kind: Secret
metadata:
name: grafana-admin
namespace: grafana
type: Opaque
stringData:
admin-user: admin
# REPLACE: Set from .env GRAFANA_ADMIN_PASSWORD
admin-password: "REPLACE_WITH_GRAFANA_ADMIN_PASSWORD"
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: grafana
  labels:
    # Marker label used by Grafana sidecar-provisioning conventions; here the
    # ConfigMap is also mounted directly into the provisioning directory.
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: M3DB
        type: prometheus
        access: proxy
        # In-cluster M3 coordinator service (PromQL-compatible query API).
        url: http://m3coordinator.m3db:7201
        basicAuth: true
        # REPLACE: Set from .env M3DB_USERNAME and M3DB_PASSWORD
        # NOTE(review): basic auth appears to be enforced at the Traefik
        # ingress layer, not by the coordinator itself — confirm whether the
        # in-cluster service URL above actually requires these credentials.
        basicAuthUser: REPLACE_WITH_M3DB_USERNAME
        secureJsonData:
          basicAuthPassword: 'REPLACE_WITH_M3DB_PASSWORD'
        isDefault: true
        editable: false
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-storage
namespace: grafana
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
storageClassName: vultr-block-storage
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: grafana
  labels:
    app.kubernetes.io/name: grafana
spec:
  # Single replica: grafana-storage is a ReadWriteOnce PVC, so additional
  # replicas could not co-mount the data volume across nodes.
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: grafana
  template:
    metadata:
      labels:
        app.kubernetes.io/name: grafana
    spec:
      # Schedule only on grafana nodepool
      nodeSelector:
        vke.vultr.com/node-pool: grafana
      # 472 is the grafana user/group in the official image (per Grafana's
      # Docker docs); fsGroup lets the container write the PVC-backed
      # /var/lib/grafana.
      securityContext:
        fsGroup: 472
        runAsUser: 472
        runAsGroup: 472
      containers:
        - name: grafana
          image: grafana/grafana:11.5.2
          ports:
            - name: http
              containerPort: 3000
              protocol: TCP
          env:
            # Admin credentials are sourced from the grafana-admin Secret.
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-admin
                  key: admin-user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-admin
                  key: admin-password
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "false"
            - name: GF_SERVER_ROOT_URL
              value: "%(protocol)s://%(domain)s:%(http_port)s/"
            # No extra plugins; empty string keeps the stock plugin set.
            - name: GF_INSTALL_PLUGINS
              value: ""
          volumeMounts:
            - name: storage
              mountPath: /var/lib/grafana
            # Grafana's datasource provisioning directory (read at startup).
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
              readOnly: true
          resources:
            requests:
              cpu: 250m
              memory: 512Mi
            limits:
              cpu: 500m
              memory: 1Gi
          livenessProbe:
            httpGet:
              path: /api/health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/health
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: storage
          persistentVolumeClaim:
            claimName: grafana-storage
        - name: datasources
          configMap:
            name: grafana-datasources
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: grafana
labels:
app.kubernetes.io/name: grafana
spec:
type: LoadBalancer
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
selector:
app.kubernetes.io/name: grafana

View File

@@ -0,0 +1,171 @@
# M3DB Backfill Runbook (Revised)
## Context
Backfilling ~3 weeks of vLLM + DCGM metrics from Mimir to M3DB.
**Blocker discovered:** `bufferPast` is immutable on existing namespaces. Downsample pipeline rejects historical writes.
**Solution:** Create new backfill namespaces with `bufferPast=504h` (21 days).
---
## Step 1 — Create Backfill Namespaces
```bash
COORD="http://m3coordinator.m3db.svc.cluster.local:7201"
# default_backfill: 7d retention, 21d bufferPast
# WARNING: retention (168h) is shorter than the 21-day backfill window —
# samples older than 7 days may be expired immediately after being written;
# confirm the retention covers the oldest data you intend to keep
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
-H "Content-Type: application/json" \
-d '{
"name": "default_backfill",
"options": {
"retentionOptions": {
"retentionPeriodDuration": "168h",
"blockSizeDuration": "2h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "504h"
}
}
}'
# agg_10s_backfill: 90d retention, 10s resolution, 21d bufferPast
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
-H "Content-Type: application/json" \
-d '{
"name": "agg_10s_backfill",
"options": {
"retentionOptions": {
"retentionPeriodDuration": "2160h",
"blockSizeDuration": "24h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "504h"
}
},
"aggregationOptions": {
"aggregations": [{
"aggregated": true,
"attributes": {
"resolutionNanos": "10000000000",
"downsampleOptions": {"all": true}
}
}]
}
}'
# agg_1m_backfill: 1y retention, 1m resolution, 21d bufferPast
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
-H "Content-Type: application/json" \
-d '{
"name": "agg_1m_backfill",
"options": {
"retentionOptions": {
"retentionPeriodDuration": "8760h",
"blockSizeDuration": "24h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "504h"
}
},
"aggregationOptions": {
"aggregations": [{
"aggregated": true,
"attributes": {
"resolutionNanos": "60000000000",
"downsampleOptions": {"all": true}
}
}]
}
}'
```
---
## Step 2 — Update Coordinator ConfigMap
Add new namespaces to `m3coordinator-config`:
```yaml
clusters:
- namespaces:
- namespace: default
type: unaggregated
retention: 168h
- namespace: default_backfill
type: unaggregated
retention: 168h
- namespace: agg_10s_30d
type: aggregated
retention: 2160h
resolution: 10s
- namespace: agg_10s_backfill
type: aggregated
retention: 2160h
resolution: 10s
- namespace: agg_1m_1y
type: aggregated
retention: 8760h
resolution: 1m
- namespace: agg_1m_backfill
type: aggregated
retention: 8760h
resolution: 1m
```
Also add downsample rules for backfill namespaces.
---
## Step 3 — Restart Coordinators
```bash
kubectl rollout restart deployment/m3coordinator -n m3db
kubectl rollout status deployment/m3coordinator -n m3db --timeout=120s
```
---
## Step 4 — Run Backfill
Write directly to `default_backfill` namespace using `__namespace__` label:
```python
# In the protobuf write request, add label:
# __namespace__ = "default_backfill"
```
Or use the coordinator endpoint:
```
POST http://m3coordinator:7201/api/v1/prom/remote/write?namespace=default_backfill
```
Backfill time range: `2026-03-11T00:00:00Z` to `2026-04-01T00:00:00Z`
---
## Step 5 — Verify
```bash
curl -sS "http://m3coordinator:7201/api/v1/query" \
--data-urlencode 'query=vllm:prompt_tokens_total' \
--data-urlencode 'time=2026-03-20T12:00:00Z'
```
---
## Step 6 — Revert bufferPast (After Backfill)
```bash
# After backfill complete, shrink bufferPast back to 10m
# (Only retentionPeriod is mutable, so this requires namespace recreation)
# OR: Leave as-is since it's a backfill-only namespace
```
---
## Performance Notes
- M3DB has been fast so far
- New namespaces won't impact existing query performance
- Queries can fan out to both old and new namespaces in parallel
- After backfill, consider consolidating (optional)

87
backfill/README.md Normal file
View File

@@ -0,0 +1,87 @@
# M3DB Backfill Tools
Scripts to backfill historical metrics from Mimir to M3DB.
## Prerequisites
Copy `.env` and set credentials:
```bash
cp .env.example .env
# Edit .env with your credentials
```
Required environment variables:
- `MIMIR_USERNAME` - Mimir API username
- `MIMIR_PASSWORD` - Mimir API password
## Files
| File | Purpose |
|------|---------|
| `backfill.py` | Main backfill script — pulls from Mimir, writes to M3DB |
| `backfill-gap.py` | Lightweight script for filling specific time gaps |
| `backfill-pod.yaml` | Kubernetes pod manifest for running backfill |
| `BACKFILL_RUNBOOK.md` | Detailed runbook with lessons learned |
| `test-metrics.py` | Test script for verifying data flow |
## Quick Usage
### Full Backfill
```bash
# Edit START_TS and END_TS in backfill.py first
# Format: Unix timestamps (seconds since epoch)
# Create configmap and run
kubectl create configmap backfill-script --from-file=backfill.py=backfill.py -n m3db
kubectl apply -f backfill-pod.yaml
kubectl logs -f backfill -n m3db
```
### Fill a Specific Gap
Edit `backfill-gap.py` to set your time range:
```python
START_TS = 1774175400 # Unix timestamp
END_TS = 1774243800 # Unix timestamp
```
Then run:
```bash
kubectl create configmap backfill-gap-script --from-file=backfill-gap.py=backfill-gap.py -n m3db
kubectl apply -f backfill-gap-pod.yaml
kubectl logs -f backfill-gap -n m3db
```
## Timestamp Helpers
```bash
# Convert date to Unix timestamp
date -u -d '2026-03-22 10:30:00' +%s
# Convert Unix timestamp to date
date -u -d @1774175400
```
## Requirements
- Mimir credentials (supplied via `MIMIR_USERNAME` / `MIMIR_PASSWORD` environment variables; see `.env`)
- M3DB coordinator endpoint: `http://m3coordinator.m3db.svc.cluster.local:7201`
- `bufferPast` must be >= the age of data you're backfilling (currently 21 days)
## Metrics Backfilled
- `vllm:prompt_tokens_total`
- `vllm:generation_tokens_total`
- `DCGM_FI_DEV_GPU_UTIL`
## Cleanup
After backfill completes:
```bash
kubectl delete pod backfill -n m3db
kubectl delete configmap backfill-script -n m3db
```

View File

@@ -0,0 +1,18 @@
apiVersion: v1
kind: Pod
metadata:
  name: backfill-gap
  namespace: m3db
spec:
  restartPolicy: Never
  volumes:
    - name: script
      configMap:
        name: backfill-gap-script
  containers:
    - name: backfill
      image: python:3.11-slim
      command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-gap.py"]
      # The script reads MIMIR_USERNAME / MIMIR_PASSWORD from its environment
      # and silently falls back to REPLACE_WITH_* placeholders when unset —
      # inject them here so the backfill can authenticate against Mimir.
      # Create the secret with:
      #   kubectl create secret generic backfill-credentials --from-env-file=.env -n m3db
      envFrom:
        - secretRef:
            name: backfill-credentials
            optional: true  # pod still starts if the secret is absent (old behavior)
      volumeMounts:
        - name: script
          mountPath: /scripts

100
backfill/backfill-gap.py Normal file
View File

@@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""One-off backfill for a specific time gap.

NOTE(review): the original docstring said "April 1 gap (10:44-11:50 UTC)",
but START_TS/END_TS below cover 2026-03-22T10:30Z to 2026-03-23T05:30Z —
edit those constants (and this note) for the gap actually being filled.

Pulls the configured vLLM/DCGM series from Mimir over the window and replays
them into the M3 coordinator via the Prometheus remote-write endpoint.
"""
import struct
import urllib.request
import urllib.error
import urllib.parse
import json
import ssl
import snappy
import base64

# Read credentials from environment (see .env)
import os

MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"

START_TS = 1774175400  # 2026-03-22T10:30:00Z
END_TS = 1774243800    # 2026-03-23T05:30:00Z
STEP = "10s"           # Mimir query_range resolution
METRICS = ["vllm:prompt_tokens_total", "vllm:generation_tokens_total", "DCGM_FI_DEV_GPU_UTIL"]
def enc(v):
    # Protobuf base-128 varint encoding of a non-negative integer.
    b = v & 0x7f
    v >>= 7
    r = b""
    while v:
        r += bytes([0x80 | b])  # continuation bit set: more bytes follow
        b = v & 0x7f
        v >>= 7
    return r + bytes([b])

def es(f, d): return enc((f<<3)|2) + enc(len(d)) + d  # length-delimited field (wire type 2)
def ed(f, v): return enc((f<<3)|1) + struct.pack("<d", v)  # fixed64 double field (wire type 1)

def build_ts(labels, samples):
    # Hand-rolled Prometheus remote-write TimeSeries message.
    # labels: dict of label name -> value; samples: iterable of (timestamp_ms, value).
    ts = b""
    for n, v in labels.items():
        l = es(1, n.encode()) + es(2, v.encode())  # Label{name, value}
        ts += enc((1<<3)|2) + enc(len(l)) + l      # TimeSeries.labels (repeated)
    for t_ms, val in samples:
        s = ed(1, val) + enc((2<<3)|0) + enc(t_ms)  # Sample{value, timestamp}
        ts += enc((2<<3)|2) + enc(len(s)) + s       # TimeSeries.samples (repeated)
    return ts

def ssl_ctx():
    # SECURITY: certificate verification is DISABLED — the Mimir connection
    # (which carries basic-auth credentials) is open to man-in-the-middle
    # interception. Prefer a verified context with the proper CA installed;
    # flagged for review rather than silently changed.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx

def mimir_req(path):
    # GET {MIMIR_URL}{path} with HTTP basic auth; returns the decoded JSON body.
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    resp = urllib.request.urlopen(req, context=ssl_ctx(), timeout=300)
    return json.loads(resp.read().decode())

def write_m3db(data):
    # POST a snappy-compressed remote-write payload to the M3 coordinator.
    # Returns True on success; on HTTPError prints a truncated error body and
    # returns False. Non-HTTP exceptions propagate to the caller.
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    try:
        urllib.request.urlopen(req, timeout=300)
        return True
    except urllib.error.HTTPError as e:
        print(f" ERROR {e.code}: {e.read().decode()[:100]}")
        return False
# ---- main: one Mimir range query per metric over the whole gap window ----
# (At STEP=10s the configured ~19h window is ~6.8k points/series, under
# typical query_range limits, so no chunking is needed here.)
print(f"Filling gap: {START_TS} to {END_TS}")
total = 0
for metric in METRICS:
    print(f"{metric}...", end=" ", flush=True)
    path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={START_TS}&end={END_TS}&step={STEP}"
    data = mimir_req(path)
    if data["status"] != "success":
        print("failed")
        continue
    series = data["data"]["result"]
    samples = sum(len(s["values"]) for s in series)
    if samples > 0:
        wr = b""
        for s in series:
            labels = dict(s["metric"])
            # Tag with the originating cluster so series match the live pipeline.
            labels["cluster"] = "serverless-inference-cluster"
            # Prometheus value tuples are [unix_seconds, "value-string"] -> (ms, float).
            pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
            ts = build_ts(labels, pts)
            wr += enc((1<<3)|2) + enc(len(ts)) + ts  # WriteRequest.timeseries
        if write_m3db(wr):
            print(f"{samples} samples written")
            total += samples
    else:
        print("no data")
print(f"Done! Total: {total} samples")

View File

@@ -0,0 +1,18 @@
apiVersion: v1
kind: Pod
metadata:
  name: backfill-massive
  namespace: m3db
spec:
  restartPolicy: Never
  volumes:
    - name: script
      configMap:
        name: backfill-massive-script
  containers:
    - name: backfill
      image: python:3.11-slim
      command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-massive.py"]
      # The script reads MIMIR_USERNAME / MIMIR_PASSWORD from its environment
      # and silently falls back to REPLACE_WITH_* placeholders when unset —
      # inject them here so the backfill can authenticate against Mimir.
      # Create the secret with:
      #   kubectl create secret generic backfill-credentials --from-env-file=.env -n m3db
      envFrom:
        - secretRef:
            name: backfill-credentials
            optional: true  # pod still starts if the secret is absent (old behavior)
      volumeMounts:
        - name: script
          mountPath: /scripts

View File

@@ -0,0 +1,153 @@
#!/usr/bin/env python3
"""
Massive backfill: March 12 - April 1, 2026
Writes ONLY to 'default' namespace (raw data)
Overlapping chunks - no gaps!
"""
import struct
import urllib.request
import urllib.error
import urllib.parse
import json
import ssl
import snappy
import base64
import time

# Read credentials from environment (see .env)
import os

MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"

# March 12 to April 1 (full range)
START_TS = 1773273600  # March 12 00:00 UTC
END_TS = 1775052000    # April 1 14:00 UTC
CHUNK_HOURS = 4        # Mimir range-query window per request
OVERLAP_MINUTES = 30   # overlap between consecutive chunks so boundaries lose
                       # no samples (presumably duplicates are deduplicated
                       # downstream — verify against the TSDB's write semantics)
STEP = "10s"
METRICS = [
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "DCGM_FI_DEV_GPU_UTIL",
]
def enc(v):
    # Protobuf base-128 varint encoding of a non-negative integer.
    b = v & 0x7f
    v >>= 7
    r = b""
    while v:
        r += bytes([0x80 | b])  # continuation bit set: more bytes follow
        b = v & 0x7f
        v >>= 7
    return r + bytes([b])

def es(f, d): return enc((f<<3)|2) + enc(len(d)) + d  # length-delimited field (wire type 2)
def ed(f, v): return enc((f<<3)|1) + struct.pack("<d", v)  # fixed64 double field (wire type 1)

def build_ts(labels, samples):
    # Hand-rolled Prometheus remote-write TimeSeries message.
    # labels: dict of label name -> value; samples: iterable of (timestamp_ms, value).
    ts = b""
    for n, v in labels.items():
        l = es(1, n.encode()) + es(2, v.encode())  # Label{name, value}
        ts += enc((1<<3)|2) + enc(len(l)) + l      # TimeSeries.labels (repeated)
    for t_ms, val in samples:
        s = ed(1, val) + enc((2<<3)|0) + enc(t_ms)  # Sample{value, timestamp}
        ts += enc((2<<3)|2) + enc(len(s)) + s       # TimeSeries.samples (repeated)
    return ts

def ssl_ctx():
    # SECURITY: certificate verification is DISABLED — the Mimir connection
    # (which carries basic-auth credentials) is open to man-in-the-middle
    # interception. Prefer a verified context with the proper CA installed;
    # flagged for review rather than silently changed.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx

def mimir_req(path):
    # GET {MIMIR_URL}{path} with HTTP basic auth; returns the decoded JSON body.
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    url = f"{MIMIR_URL}{path}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Basic {auth}")
    resp = urllib.request.urlopen(req, context=ssl_ctx(), timeout=300)
    return json.loads(resp.read().decode())

def write_m3db(data):
    # POST a snappy-compressed remote-write payload to the M3 coordinator.
    # Returns True on success; on HTTPError prints a truncated error body and
    # returns False. Non-HTTP exceptions propagate to the caller.
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    # TARGET ONLY DEFAULT NAMESPACE
    req.add_header("X-M3-Namespace", "default")
    try:
        urllib.request.urlopen(req, timeout=300)
        return True
    except urllib.error.HTTPError as e:
        err = e.read().decode()[:200]
        print(f" WRITE ERROR {e.code}: {err}")
        return False
# ---- main driver ----
# For each metric: query Mimir in CHUNK_HOURS-hour windows, overlapping
# consecutive windows by OVERLAP_MINUTES so chunk boundaries lose no samples,
# and remote-write every non-empty chunk to the M3 coordinator.
print(f"MASSIVE BACKFILL - DEFAULT NAMESPACE ONLY")
print(f"Range: March 12 - April 1, 2026")
print(f"Chunk size: {CHUNK_HOURS}h, Overlap: {OVERLAP_MINUTES}m")
print(f"Metrics: {len(METRICS)}")
print("="*60)
total_samples = 0
start_time = time.time()
chunk_seconds = CHUNK_HOURS * 3600
overlap_seconds = OVERLAP_MINUTES * 60
for metric in METRICS:
    print(f"\n{metric}:")
    metric_samples = 0
    chunk_num = 0
    chunk_start = START_TS
    while chunk_start < END_TS:
        chunk_end = min(chunk_start + chunk_seconds, END_TS)
        chunk_num += 1
        # BUG FIX: the original advanced unconditionally with
        #   chunk_start = chunk_end - overlap_seconds
        # Once chunk_end reached END_TS, that stepped chunk_start back BELOW
        # END_TS again, so the final window was re-queried forever (infinite
        # loop). Clamp the next start so the loop terminates after the last
        # chunk.
        next_start = chunk_end if chunk_end >= END_TS else chunk_end - overlap_seconds
        path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={chunk_start}&end={chunk_end}&step={STEP}"
        try:
            data = mimir_req(path)
        except Exception as e:
            # Best-effort: log the failed window and move on to the next.
            print(f" Chunk {chunk_num}: QUERY FAILED: {e}")
            chunk_start = next_start
            continue
        if data.get("status") != "success":
            print(f" Chunk {chunk_num}: status={data.get('status')}")
            chunk_start = next_start
            continue
        series = data["data"]["result"]
        samples = sum(len(s["values"]) for s in series)
        if samples == 0:
            chunk_start = next_start
            continue
        # Build a single WriteRequest containing every series in this chunk.
        wr = b""
        for s in series:
            labels = dict(s["metric"])
            labels["cluster"] = "serverless-inference-cluster"
            # Prometheus value tuples are [unix_seconds, "value-string"] -> (ms, float).
            pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
            ts = build_ts(labels, pts)
            wr += enc((1<<3)|2) + enc(len(ts)) + ts  # WriteRequest.timeseries
        if write_m3db(wr):
            metric_samples += samples
            hrs = (chunk_end - chunk_start) / 3600
            print(f" Chunk {chunk_num}: {samples:,} samples ({hrs:.1f}h) ✓", flush=True)
        # Next chunk starts with overlap (clamped at END_TS, see above)
        chunk_start = next_start
    total_samples += metric_samples
    print(f" TOTAL {metric}: {metric_samples:,} samples")
elapsed = time.time() - start_time
print("="*60)
print(f"DONE! {total_samples:,} samples in {elapsed:.1f}s")

View File

@@ -0,0 +1,18 @@
apiVersion: v1
kind: Pod
metadata:
  name: backfill-v2
  namespace: m3db
spec:
  restartPolicy: Never
  volumes:
    - name: script
      configMap:
        name: backfill-script-v2
  containers:
    - name: backfill
      image: python:3.11-slim
      command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill.py"]
      # The script reads MIMIR_USERNAME / MIMIR_PASSWORD from its environment
      # and silently falls back to REPLACE_WITH_* placeholders when unset —
      # inject them here so the backfill can authenticate against Mimir.
      # Create the secret with:
      #   kubectl create secret generic backfill-credentials --from-env-file=.env -n m3db
      envFrom:
        - secretRef:
            name: backfill-credentials
            optional: true  # pod still starts if the secret is absent (old behavior)
      volumeMounts:
        - name: script
          mountPath: /scripts

124
backfill/backfill.py Normal file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""M3DB Backfill - Pull vLLM/DCGM metrics from Mimir and write to M3DB"""
import struct
import urllib.request
import urllib.error
import urllib.parse
import json
import ssl
import snappy
import base64
import sys

print("Starting backfill script...", flush=True)

# Read credentials from environment (see .env)
import os

MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"

START_TS = 1773187200  # 2026-03-11T00:00:00Z
# NOTE(review): original comment said 2026-04-01T11:40:00Z, but this epoch
# value is 2026-04-01T10:40:00Z UTC ("just before node restart").
END_TS = 1775040000
STEP = "10s"           # Mimir query_range resolution
CHUNK_HOURS = 6        # range-query window per Mimir request
METRICS = [
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "DCGM_FI_DEV_GPU_UTIL",
]
def enc(v):
b = v & 0x7f
v >>= 7
r = b""
while v:
r += bytes([0x80 | b])
b = v & 0x7f
v >>= 7
return r + bytes([b])
def es(f, d):
    """Protobuf length-delimited field (wire type 2): tag, length, payload bytes."""
    tag = enc((f << 3) | 2)
    return tag + enc(len(d)) + d


def ed(f, v):
    """Protobuf fixed64 double field (wire type 1): tag, then little-endian IEEE-754."""
    tag = enc((f << 3) | 1)
    return tag + struct.pack("<d", v)
def build_ts(labels, samples):
    # Build one Prometheus remote-write TimeSeries protobuf message by hand.
    # labels: dict of label name -> value; samples: iterable of (timestamp_ms, value).
    ts = b""
    for n, v in labels.items():
        # Label message: field 1 = name, field 2 = value (both strings).
        l = es(1, n.encode()) + es(2, v.encode())
        # TimeSeries field 1 (repeated Label), wire type 2 (length-delimited).
        ts += enc((1<<3)|2) + enc(len(l)) + l
    for t_ms, val in samples:
        # Sample message: field 1 = double value, field 2 = varint timestamp (ms).
        s = ed(1, val) + enc((2<<3)|0) + enc(t_ms)
        # TimeSeries field 2 (repeated Sample), wire type 2.
        ts += enc((2<<3)|2) + enc(len(s)) + s
    return ts
def ssl_ctx():
    # SECURITY: certificate verification is DISABLED — the Mimir connection
    # (which carries basic-auth credentials) is open to man-in-the-middle
    # interception. Prefer a verified default context with the proper CA
    # installed; flagged for review rather than silently changed.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx
def mimir_req(path):
    # GET {MIMIR_URL}{path} with HTTP basic auth from MIMIR_USER/MIMIR_PASS
    # and return the decoded JSON body. 300 s timeout accommodates large
    # range queries. NOTE: TLS verification is disabled via ssl_ctx() and the
    # response object is not explicitly closed.
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    resp = urllib.request.urlopen(req, context=ssl_ctx(), timeout=300)
    return json.loads(resp.read().decode())
def write_m3db(data):
    """POST a snappy-compressed remote-write payload to the M3 coordinator.

    data: serialized Prometheus WriteRequest protobuf (uncompressed bytes).
    Returns True on success, False on an HTTP error (the first 100 chars of
    the error body are printed for diagnosis). Non-HTTP errors propagate.
    """
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    try:
        # FIX: close the response promptly instead of binding it to an unused
        # variable and leaking the connection until GC.
        with urllib.request.urlopen(req, timeout=300):
            return True
    except urllib.error.HTTPError as e:
        print(f" ERROR {e.code}: {e.read().decode()[:100]}", flush=True)
        return False
# ---- main: pull each metric from Mimir in CHUNK_HOURS-hour chunks and push
# each non-empty chunk to the M3 coordinator.
# NOTE(review): leading indentation was lost in the diff rendering; the
# nesting of the chunk counter / progress print below is reconstructed —
# verify against the repository copy.
print(f"Time range: {START_TS} to {END_TS}", flush=True)
total = 0
for metric in METRICS:
    print(f"\n{metric}...", flush=True)
    metric_total = 0
    chunk_start = START_TS
    chunks_done = 0
    while chunk_start < END_TS:
        chunk_end = min(chunk_start + CHUNK_HOURS * 3600, END_TS)
        try:
            path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={chunk_start}&end={chunk_end}&step={STEP}"
            data = mimir_req(path)
            if data["status"] != "success":
                chunk_start = chunk_end
                continue
            series = data["data"]["result"]
            samples = sum(len(s["values"]) for s in series)
            if samples > 0:
                wr = b""
                for s in series:
                    labels = dict(s["metric"])
                    # Tag with the originating cluster so series match the live pipeline.
                    labels["cluster"] = "serverless-inference-cluster"
                    # Prometheus value tuples are [unix_seconds, "value-string"] -> (ms, float).
                    pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
                    ts = build_ts(labels, pts)
                    wr += enc((1<<3)|2) + enc(len(ts)) + ts  # WriteRequest.timeseries
                if write_m3db(wr):
                    metric_total += samples
            chunks_done += 1
            # Periodic progress report every 10 chunks.
            if chunks_done % 10 == 0:
                print(f" {chunks_done} chunks, {metric_total} samples...", flush=True)
        except Exception as e:
            # Best-effort: log and advance to the next chunk rather than abort.
            print(f" Chunk error: {e}", flush=True)
        chunk_start = chunk_end
    print(f" Done: {metric_total} samples", flush=True)
    total += metric_total
print(f"\nBackfill complete! Total: {total} samples", flush=True)