Clean slate: 1h block sizes, remove backfill artifacts

- Changed all namespace block sizes to 1h (was 2h/12h/24h in manifests,
  30d+ in the live cluster due to backfill-era bufferPast hacks)
- Deleted entire backfill/ directory (scripts, pods, runbooks)
- Removed stale 05-m3coordinator.yaml (had backfill namespaces)
- Added 05-m3coordinator-deployment.yaml to kustomization
- Fixed init job health check (/health instead of /api/v1/services/m3db/health)
- Updated .env.example (removed Mimir credentials)
- Added 'Why Backfill Doesn't Work' section to README
This commit is contained in:
2026-04-09 19:00:08 +00:00
parent 1af29e8f09
commit 7ade5ecac8
16 changed files with 115 additions and 1117 deletions

View File

@@ -6,10 +6,6 @@
M3DB_USERNAME=<your-m3db-username>
M3DB_PASSWORD=<your-m3db-password>
# Mimir (source for backfill)
MIMIR_USERNAME=<your-mimir-username>
MIMIR_PASSWORD=<your-mimir-password>
# Grafana Admin
GRAFANA_ADMIN_PASSWORD=<your-grafana-admin-password>

View File

@@ -0,0 +1,63 @@
---
# m3coordinator: stateless read/write gateway in front of the m3dbnode
# cluster. Two replicas for availability; config comes from the
# m3coordinator-config ConfigMap mounted at /etc/m3coordinator.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: m3coordinator
  namespace: m3db
  labels:
    app.kubernetes.io/name: m3coordinator
spec:
  replicas: 2
  selector:
    matchLabels:
      app.kubernetes.io/name: m3coordinator
  template:
    metadata:
      labels:
        app.kubernetes.io/name: m3coordinator
    spec:
      containers:
        - name: m3coordinator
          image: quay.io/m3db/m3coordinator:v1.5.0
          args:
            - -f
            - /etc/m3coordinator/m3coordinator.yml
          ports:
            - name: api
              containerPort: 7201
            - name: metrics
              containerPort: 7203
          # /health on the API port is the coordinator's health endpoint
          # (the /api/v1/services/m3db/health path used previously is not
          # a liveness endpoint).
          livenessProbe:
            httpGet:
              path: /health
              port: 7201
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /health
              port: 7201
            initialDelaySeconds: 5
            periodSeconds: 5
          volumeMounts:
            - name: config
              mountPath: /etc/m3coordinator
      volumes:
        - name: config
          configMap:
            name: m3coordinator-config
---
# Cluster-internal Service fronting both coordinator replicas.
apiVersion: v1
kind: Service
metadata:
  name: m3coordinator
  namespace: m3db
spec:
  selector:
    app.kubernetes.io/name: m3coordinator
  ports:
    - name: api
      port: 7201
      targetPort: api
    - name: metrics
      port: 7203
      targetPort: metrics

View File

@@ -1,70 +0,0 @@
---
# m3coordinator configuration, mounted at /etc/m3coordinator/m3coordinator.yml.
# NOTE(review): namespace names here (agg_1m_60d / agg_1h_1y) do not match the
# README's retention tiers (agg_10s_30d / agg_1m_1y) — this is the stale
# manifest being removed by this commit.
apiVersion: v1
kind: ConfigMap
metadata:
  name: m3coordinator-config
  namespace: m3db
  labels:
    app.kubernetes.io/name: m3coordinator
data:
  m3coordinator.yml: |
    listenAddress: 0.0.0.0:7201
    logging:
      level: info
    metrics:
      scope:
        prefix: coordinator
      prometheus:
        handlerPath: /metrics
        listenAddress: 0.0.0.0:7203
      sanitization: prometheus
      samplingRate: 1.0
    tagOptions:
      idScheme: quoted
    clusters:
      - namespaces:
          - namespace: default
            type: unaggregated
            retention: 720h
          - namespace: agg_1m_60d
            type: aggregated
            retention: 1440h
            resolution: 1m
          - namespace: agg_1h_1y
            type: aggregated
            retention: 8760h
            resolution: 1h
        client:
          config:
            service:
              env: default_env
              zone: embedded
              service: m3db
              cacheDir: /var/lib/m3kv
              etcdClusters:
                - zone: embedded
                  endpoints:
                    - http://etcd-0.etcd.m3db.svc.cluster.local:2379
                    - http://etcd-1.etcd.m3db.svc.cluster.local:2379
                    - http://etcd-2.etcd.m3db.svc.cluster.local:2379
          writeConsistencyLevel: majority
          readConsistencyLevel: unstrict_majority
    downsample:
      rules:
        mappingRules:
          - name: "1min for 60 days"
            filter: "__name__:*"
            aggregations: ["Last"]
            storagePolicies:
              - resolution: 1m
                retention: 1440h
          - name: "1hour for 1 year"
            filter: "__name__:*"
            aggregations: ["Last"]
            storagePolicies:
              - resolution: 1h
                retention: 8760h

View File

@@ -63,7 +63,7 @@ spec:
COORD="http://m3coordinator.m3db.svc.cluster.local:7201"
echo "=== Waiting for coordinator to be healthy ==="
until curl -sf "${COORD}/api/v1/services/m3db/health"; do
until curl -sf "${COORD}/health"; do
echo "Coordinator not ready yet, retrying in 5s..."
sleep 5
done
@@ -121,13 +121,13 @@ spec:
"repairEnabled": false,
"retentionOptions": {
"retentionPeriodDuration": "48h",
"blockSizeDuration": "2h",
"blockSizeDuration": "1h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "10m"
},
"indexOptions": {
"enabled": true,
"blockSizeDuration": "2h"
"blockSizeDuration": "1h"
}
}
}'
@@ -146,13 +146,13 @@ spec:
"snapshotEnabled": true,
"retentionOptions": {
"retentionPeriodDuration": "720h",
"blockSizeDuration": "12h",
"blockSizeDuration": "1h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "10m"
},
"indexOptions": {
"enabled": true,
"blockSizeDuration": "12h"
"blockSizeDuration": "1h"
},
"aggregationOptions": {
"aggregations": [
@@ -181,13 +181,13 @@ spec:
"snapshotEnabled": true,
"retentionOptions": {
"retentionPeriodDuration": "8760h",
"blockSizeDuration": "24h",
"blockSizeDuration": "1h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "10m"
},
"indexOptions": {
"enabled": true,
"blockSizeDuration": "24h"
"blockSizeDuration": "1h"
},
"aggregationOptions": {
"aggregations": [

View File

@@ -44,11 +44,13 @@ Internet → Vultr LoadBalancer → Traefik (TLS + basic auth) → m3coordinator
## Retention Tiers
| Namespace | Resolution | Retention | Use Case |
|----------------|-----------|-----------|---------------------------|
| `default` | raw | 48h | Real-time queries |
| `agg_10s_30d` | 10s | 30 days | Recent dashboards |
| `agg_1m_1y` | 1m | 1 year | Long-term trends/capacity |
All namespaces use **1h block size** — the sweet spot for M3DB. Smaller blocks mean faster queries, faster flushes, and less memory pressure during compaction. See [Why Backfill Doesn't Work](#why-backfill-doesnt-work) for why larger blocks were a disaster.
| Namespace | Resolution | Retention | Block Size | Use Case |
|----------------|-----------|-----------|------------|---------------------------|
| `default` | raw | 48h | 1h | Real-time queries |
| `agg_10s_30d` | 10s | 30 days | 1h | Recent dashboards |
| `agg_1m_1y` | 1m | 1 year | 1h | Long-term trends/capacity |
## Deployment
@@ -96,13 +98,13 @@ kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/
# Create namespaces
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
-H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"2h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"2h"}}}'
-H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"}}}'
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
-H "Content-Type: application/json" -d '{"name":"agg_10s_30d","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"720h","blockSizeDuration":"12h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"12h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"10s"}}]}}}'
-H "Content-Type: application/json" -d '{"name":"agg_10s_30d","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"720h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"10s"}}]}}}'
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
-H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"24h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"24h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}'
-H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}'
# Wait for bootstrapping to complete (check shard state = AVAILABLE)
kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health
@@ -250,6 +252,40 @@ remote_write:
- **Shards**: The init job creates 64 shards across 3 nodes. For higher cardinality, increase to 128 or 256.
- **Volume expansion**: The StorageClass has `allowVolumeExpansion: true` — you can resize PVCs online via `kubectl edit pvc`.
## Why Backfill Doesn't Work
**TL;DR: M3DB is not designed for historical data import. Don't try it.**
M3DB is a time-series database optimized for real-time ingestion and sequential writes. Backfilling — writing data with timestamps in the past — fights the fundamental architecture at every turn:
### The Problems
1. **`bufferPast` is a hard gate.** M3DB rejects writes whose timestamps fall outside the `bufferPast` window (default: 10m). To write data from 3 weeks ago, you need `bufferPast=504h` (21 days). This setting is **immutable** on existing namespaces — you have to create entirely new namespaces just for backfill, doubling your operational complexity.
2. **Massive block sizes were required.** To make the backfill namespaces work with `bufferPast=504h`, block sizes had to be enormous (30+ day blocks). This defeated the entire point of M3DB's time-partitioned storage — blocks that large cause extreme memory pressure, slow compaction, and bloated index lookups.
3. **Downsample pipeline ignores historical data.** M3DB's downsample coordinator only processes new writes in real-time. Backfilled data written to `default_backfill` namespaces never gets downsampled into aggregated namespaces, so your long-term retention tiers have gaps.
4. **No transaction boundaries.** Each backfill write is an individual operation. Writing 12M+ samples means 12M+ individual writes with no batching semantics. If one fails, there's no rollback, no retry from a checkpoint — you get partial data with no easy way to detect or fix gaps.
5. **Compaction and flush chaos.** M3DB expects data to flow sequentially through commitlog → flush → compact. Backfill dumps data out of order, causing the background compaction to thrash, consuming CPU and I/O for blocks that may never be queried again.
### What We Tried
- Created `default_backfill`, `agg_10s_backfill`, `agg_1m_backfill` namespaces with `bufferPast=504h`
- Increased block sizes to 24h–30d to accommodate the large bufferPast
- Wrote 12M+ samples from Mimir to M3DB over multiple runs
- Result: Data landed, but the operational cost was catastrophic — huge blocks, no downsampling, and the cluster was unstable
### What To Do Instead
- **Start fresh.** Configure M3DB with sane block sizes (1h) from day one and let it accumulate data naturally via Prometheus remote_write.
- **Accept the gap.** Historical data lives in Mimir (or wherever it was before). Query Mimir for old data, M3DB for new data.
- **Dual-write during migration.** Write to both systems simultaneously until M3DB's retention catches up.
- **If you absolutely need old data in M3DB**, accept that you're doing a one-time migration and build tooling around the constraints — but know that it's a project, not a script.
---
## Useful Commands
```bash

View File

@@ -1,171 +0,0 @@
# M3DB Backfill Runbook (Revised)
## Context
Backfilling ~3 weeks of vLLM + DCGM metrics from Mimir to M3DB.
**Blocker discovered:** `bufferPast` is immutable on existing namespaces, and the downsample pipeline ignores historical writes (only real-time writes are aggregated).
**Solution:** Create new backfill namespaces with `bufferPast=504h` (21 days).
---
## Step 1 — Create Backfill Namespaces
```bash
COORD="http://m3coordinator.m3db.svc.cluster.local:7201"
# default_backfill: 7d retention, 21d bufferPast
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
-H "Content-Type: application/json" \
-d '{
"name": "default_backfill",
"options": {
"retentionOptions": {
"retentionPeriodDuration": "168h",
"blockSizeDuration": "2h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "504h"
}
}
}'
# agg_10s_backfill: 90d retention, 10s resolution, 21d bufferPast
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
-H "Content-Type: application/json" \
-d '{
"name": "agg_10s_backfill",
"options": {
"retentionOptions": {
"retentionPeriodDuration": "2160h",
"blockSizeDuration": "24h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "504h"
}
},
"aggregationOptions": {
"aggregations": [{
"aggregated": true,
"attributes": {
"resolutionNanos": "10000000000",
"downsampleOptions": {"all": true}
}
}]
}
}'
# agg_1m_backfill: 1y retention, 1m resolution, 21d bufferPast
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
-H "Content-Type: application/json" \
-d '{
"name": "agg_1m_backfill",
"options": {
"retentionOptions": {
"retentionPeriodDuration": "8760h",
"blockSizeDuration": "24h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "504h"
}
},
"aggregationOptions": {
"aggregations": [{
"aggregated": true,
"attributes": {
"resolutionNanos": "60000000000",
"downsampleOptions": {"all": true}
}
}]
}
}'
```
---
## Step 2 — Update Coordinator ConfigMap
Add new namespaces to `m3coordinator-config`:
```yaml
clusters:
- namespaces:
- namespace: default
type: unaggregated
retention: 168h
- namespace: default_backfill
type: unaggregated
retention: 168h
- namespace: agg_10s_30d
type: aggregated
retention: 2160h
resolution: 10s
- namespace: agg_10s_backfill
type: aggregated
retention: 2160h
resolution: 10s
- namespace: agg_1m_1y
type: aggregated
retention: 8760h
resolution: 1m
- namespace: agg_1m_backfill
type: aggregated
retention: 8760h
resolution: 1m
```
Also add downsample rules for backfill namespaces.
---
## Step 3 — Restart Coordinators
```bash
kubectl rollout restart deployment/m3coordinator -n m3db
kubectl rollout status deployment/m3coordinator -n m3db --timeout=120s
```
---
## Step 4 — Run Backfill
Write directly to `default_backfill` namespace using `__namespace__` label:
```python
# In the protobuf write request, add label:
# __namespace__ = "default_backfill"
```
Or use the coordinator endpoint:
```
POST http://m3coordinator:7201/api/v1/prom/remote/write?namespace=default_backfill
```
Backfill time range: `2026-03-11T00:00:00Z` to `2026-04-01T00:00:00Z`
---
## Step 5 — Verify
```bash
curl -sS "http://m3coordinator:7201/api/v1/query" \
--data-urlencode 'query=vllm:prompt_tokens_total' \
--data-urlencode 'time=2026-03-20T12:00:00Z'
```
---
## Step 6 — Revert bufferPast (After Backfill)
```bash
# After backfill complete, shrink bufferPast back to 10m
# (Only retentionPeriod is mutable, so this requires namespace recreation)
# OR: Leave as-is since it's a backfill-only namespace
```
---
## Performance Notes
- M3DB ingest and query performance has been acceptable so far during backfill runs
- New namespaces won't impact existing query performance
- Queries can fan out to both old and new namespaces in parallel
- After backfill, consider consolidating (optional)

View File

@@ -1,87 +0,0 @@
# M3DB Backfill Tools
Scripts to backfill historical metrics from Mimir to M3DB.
## Prerequisites
Copy `.env` and set credentials:
```bash
cp .env.example .env
# Edit .env with your credentials
```
Required environment variables:
- `MIMIR_USERNAME` - Mimir API username
- `MIMIR_PASSWORD` - Mimir API password
## Files
| File | Purpose |
|------|---------|
| `backfill.py` | Main backfill script — pulls from Mimir, writes to M3DB |
| `backfill-gap.py` | Lightweight script for filling specific time gaps |
| `backfill-pod.yaml` | Kubernetes pod manifest for running backfill |
| `BACKFILL_RUNBOOK.md` | Detailed runbook with lessons learned |
| `test-metrics.py` | Test script for verifying data flow |
## Quick Usage
### Full Backfill
```bash
# Edit START_TS and END_TS in backfill.py first
# Format: Unix timestamps (seconds since epoch)
# Create configmap and run
kubectl create configmap backfill-script --from-file=backfill.py=backfill.py -n m3db
kubectl apply -f backfill-pod.yaml
kubectl logs -f backfill -n m3db
```
### Fill a Specific Gap
Edit `backfill-gap.py` to set your time range:
```python
START_TS = 1774175400 # Unix timestamp
END_TS = 1774243800 # Unix timestamp
```
Then run:
```bash
kubectl create configmap backfill-gap-script --from-file=backfill-gap.py=backfill-gap.py -n m3db
kubectl apply -f backfill-gap-pod.yaml
kubectl logs -f backfill-gap -n m3db
```
## Timestamp Helpers
```bash
# Convert date to Unix timestamp
date -u -d '2026-03-22 10:30:00' +%s
# Convert Unix timestamp to date
date -u -d @1774175400
```
## Requirements
- Mimir credentials (supplied via the `MIMIR_USERNAME`/`MIMIR_PASSWORD` environment variables — see `.env.example`)
- M3DB coordinator endpoint: `http://m3coordinator.m3db.svc.cluster.local:7201`
- `bufferPast` must be >= the age of data you're backfilling (currently 21 days)
## Metrics Backfilled
- `vllm:prompt_tokens_total`
- `vllm:generation_tokens_total`
- `DCGM_FI_DEV_GPU_UTIL`
## Cleanup
After backfill completes:
```bash
kubectl delete pod backfill -n m3db
kubectl delete configmap backfill-script -n m3db
```

View File

@@ -1,18 +0,0 @@
---
# One-shot pod that runs the backfill-gap.py script from a ConfigMap.
apiVersion: v1
kind: Pod
metadata:
  name: backfill-gap
  namespace: m3db
spec:
  # Job-style pod: never restart after completion or failure.
  restartPolicy: Never
  volumes:
    - name: script
      configMap:
        name: backfill-gap-script
  containers:
    - name: backfill
      image: python:3.11-slim
      # Install runtime deps at startup, then run the mounted script.
      command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-gap.py"]
      volumeMounts:
        - name: script
          mountPath: /scripts

View File

@@ -1,100 +0,0 @@
#!/usr/bin/env python3
"""Quick backfill for a specific gap: Mimir -> M3DB remote write.

NOTE(review): the configured range below decodes to 2026-03-22T10:30:00Z
through 2026-03-23T05:30:00Z (per START_TS/END_TS), not the "April 1 gap
(10:44-11:50 UTC)" the original docstring claimed — the timestamps are
authoritative.
"""
import struct
import urllib.request
import urllib.error
import urllib.parse
import json
import ssl
import snappy
import base64
# Read credentials from environment (see .env)
import os

# Source (Mimir) and destination (M3DB coordinator) endpoints.
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"

START_TS = 1774175400  # 2026-03-22T10:30:00Z
END_TS = 1774243800  # 2026-03-23T05:30:00Z
STEP = "10s"  # query_range resolution
METRICS = ["vllm:prompt_tokens_total", "vllm:generation_tokens_total", "DCGM_FI_DEV_GPU_UTIL"]
def enc(v):
    """Encode a non-negative int as a protobuf base-128 varint (bytes)."""
    b = v & 0x7f
    v >>= 7
    out = b""
    while v:
        out += bytes([0x80 | b])
        b = v & 0x7f
        v >>= 7
    return out + bytes([b])


def es(f, d):
    """Length-delimited (wire type 2) protobuf field `f` with payload `d`."""
    return enc((f << 3) | 2) + enc(len(d)) + d


def ed(f, v):
    """64-bit (wire type 1) protobuf field `f` holding little-endian double `v`."""
    return enc((f << 3) | 1) + struct.pack("<d", v)


def build_ts(labels, samples):
    """Serialize one Prometheus remote-write TimeSeries message.

    labels: dict of label name -> value (repeated Label, field 1).
    samples: iterable of (timestamp_ms, value) pairs (repeated Sample, field 2).
    """
    ts = b""
    for name, value in labels.items():
        label = es(1, name.encode()) + es(2, value.encode())
        ts += enc((1 << 3) | 2) + enc(len(label)) + label
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        ts += enc((2 << 3) | 2) + enc(len(sample)) + sample
    return ts


def ssl_ctx():
    """TLS context with certificate verification disabled.

    NOTE(review): skips hostname and cert checks for the Mimir endpoint —
    tolerable for a throwaway backfill tool, not for production code.
    """
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx


def mimir_req(path):
    """GET MIMIR_URL + path with basic auth; return the parsed JSON body."""
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    resp = urllib.request.urlopen(req, context=ssl_ctx(), timeout=300)
    return json.loads(resp.read().decode())


def write_m3db(data):
    """Snappy-compress a WriteRequest and POST it to the M3DB coordinator.

    Returns True on success; on HTTP error prints a truncated message and
    returns False (best-effort — callers decide whether to retry).
    """
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    try:
        urllib.request.urlopen(req, timeout=300)
        return True
    except urllib.error.HTTPError as e:
        print(f" ERROR {e.code}: {e.read().decode()[:100]}")
        return False
# --- main: one query_range per metric for the whole gap, one write per metric.
print(f"Filling gap: {START_TS} to {END_TS}")
total = 0
for metric in METRICS:
    print(f"{metric}...", end=" ", flush=True)
    path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={START_TS}&end={END_TS}&step={STEP}"
    data = mimir_req(path)
    if data["status"] != "success":
        print("failed")
        continue
    series = data["data"]["result"]
    samples = sum(len(s["values"]) for s in series)
    if samples > 0:
        wr = b""
        for s in series:
            labels = dict(s["metric"])
            # Tag everything with the originating cluster.
            labels["cluster"] = "serverless-inference-cluster"
            # Mimir returns [unix_seconds, "value"]; remote write wants ms.
            pts = [(int(float(v[0]) * 1000), float(v[1])) for v in s["values"]]
            ts = build_ts(labels, pts)
            wr += enc((1 << 3) | 2) + enc(len(ts)) + ts
        if write_m3db(wr):
            print(f"{samples} samples written")
            total += samples
    else:
        print("no data")
print(f"Done! Total: {total} samples")

View File

@@ -1,18 +0,0 @@
---
# One-shot pod that runs the backfill-massive.py script from a ConfigMap.
apiVersion: v1
kind: Pod
metadata:
  name: backfill-massive
  namespace: m3db
spec:
  # Job-style pod: never restart after completion or failure.
  restartPolicy: Never
  volumes:
    - name: script
      configMap:
        name: backfill-massive-script
  containers:
    - name: backfill
      image: python:3.11-slim
      # Install runtime deps at startup, then run the mounted script.
      command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-massive.py"]
      volumeMounts:
        - name: script
          mountPath: /scripts

View File

@@ -1,153 +0,0 @@
#!/usr/bin/env python3
"""
Massive backfill: March 12 - April 1, 2026
Writes ONLY to 'default' namespace (raw data)
Overlapping chunks - no gaps!
"""
import struct
import urllib.request
import urllib.error
import urllib.parse
import json
import ssl
import snappy
import base64
import time
# Read credentials from environment (see .env)
import os
# Source (Mimir) and destination (M3DB coordinator) endpoints.
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"
# March 12 to April 1 (full range)
START_TS = 1773273600 # March 12 00:00 UTC
END_TS = 1775052000 # April 1 14:00 UTC
CHUNK_HOURS = 4 # 4-hour chunks
OVERLAP_MINUTES = 30 # 30-min overlap between chunks
# query_range resolution
STEP = "10s"
METRICS = [
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"DCGM_FI_DEV_GPU_UTIL",
]
def enc(v):
    """Encode a non-negative int as a protobuf base-128 varint (bytes)."""
    b = v & 0x7f
    v >>= 7
    out = b""
    while v:
        out += bytes([0x80 | b])
        b = v & 0x7f
        v >>= 7
    return out + bytes([b])


def es(f, d):
    """Length-delimited (wire type 2) protobuf field `f` with payload `d`."""
    return enc((f << 3) | 2) + enc(len(d)) + d


def ed(f, v):
    """64-bit (wire type 1) protobuf field `f` holding little-endian double `v`."""
    return enc((f << 3) | 1) + struct.pack("<d", v)


def build_ts(labels, samples):
    """Serialize one Prometheus remote-write TimeSeries message.

    labels: dict of label name -> value (repeated Label, field 1).
    samples: iterable of (timestamp_ms, value) pairs (repeated Sample, field 2).
    """
    ts = b""
    for name, value in labels.items():
        label = es(1, name.encode()) + es(2, value.encode())
        ts += enc((1 << 3) | 2) + enc(len(label)) + label
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        ts += enc((2 << 3) | 2) + enc(len(sample)) + sample
    return ts


def ssl_ctx():
    """TLS context with certificate verification disabled (backfill-only)."""
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx


def mimir_req(path):
    """GET MIMIR_URL + path with basic auth; return the parsed JSON body."""
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    url = f"{MIMIR_URL}{path}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Basic {auth}")
    resp = urllib.request.urlopen(req, context=ssl_ctx(), timeout=300)
    return json.loads(resp.read().decode())
def write_m3db(data):
    """Snappy-compress a WriteRequest and POST it to the M3DB coordinator.

    Adds the X-M3-Namespace header so the write targets ONLY the 'default'
    namespace. Returns True on success; on HTTP error prints a truncated
    message and returns False.
    """
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    # TARGET ONLY DEFAULT NAMESPACE
    req.add_header("X-M3-Namespace", "default")
    try:
        urllib.request.urlopen(req, timeout=300)
        return True
    except urllib.error.HTTPError as e:
        err = e.read().decode()[:200]
        print(f" WRITE ERROR {e.code}: {err}")
        return False
# --- main: per-metric chunked backfill with 30-minute chunk overlap so no
# samples are lost at chunk boundaries (duplicate writes are idempotent).
print(f"MASSIVE BACKFILL - DEFAULT NAMESPACE ONLY")
print(f"Range: March 12 - April 1, 2026")
print(f"Chunk size: {CHUNK_HOURS}h, Overlap: {OVERLAP_MINUTES}m")
print(f"Metrics: {len(METRICS)}")
print("=" * 60)
total_samples = 0
start_time = time.time()
chunk_seconds = CHUNK_HOURS * 3600
overlap_seconds = OVERLAP_MINUTES * 60
for metric in METRICS:
    print(f"\n{metric}:")
    metric_samples = 0
    chunk_num = 0
    chunk_start = START_TS
    while chunk_start < END_TS:
        chunk_end = min(chunk_start + chunk_seconds, END_TS)
        chunk_num += 1
        path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={chunk_start}&end={chunk_end}&step={STEP}"
        try:
            data = mimir_req(path)
        except Exception as e:
            # Query failure: log and advance; the overlap limits the hole.
            print(f" Chunk {chunk_num}: QUERY FAILED: {e}")
            chunk_start = chunk_end - overlap_seconds
            continue
        if data.get("status") != "success":
            print(f" Chunk {chunk_num}: status={data.get('status')}")
            chunk_start = chunk_end - overlap_seconds
            continue
        series = data["data"]["result"]
        samples = sum(len(s["values"]) for s in series)
        if samples == 0:
            chunk_start = chunk_end - overlap_seconds
            continue
        wr = b""
        for s in series:
            labels = dict(s["metric"])
            labels["cluster"] = "serverless-inference-cluster"
            # Mimir returns [unix_seconds, "value"]; remote write wants ms.
            pts = [(int(float(v[0]) * 1000), float(v[1])) for v in s["values"]]
            ts = build_ts(labels, pts)
            wr += enc((1 << 3) | 2) + enc(len(ts)) + ts
        if write_m3db(wr):
            metric_samples += samples
            hrs = (chunk_end - chunk_start) / 3600
            print(f" Chunk {chunk_num}: {samples:,} samples ({hrs:.1f}h) ✓", flush=True)
        # Next chunk starts with overlap
        chunk_start = chunk_end - overlap_seconds
    total_samples += metric_samples
    print(f" TOTAL {metric}: {metric_samples:,} samples")
elapsed = time.time() - start_time
print("=" * 60)
print(f"DONE! {total_samples:,} samples in {elapsed:.1f}s")

View File

@@ -1,18 +0,0 @@
---
# One-shot pod that runs the backfill.py script from a ConfigMap.
apiVersion: v1
kind: Pod
metadata:
  name: backfill-v2
  namespace: m3db
spec:
  # Job-style pod: never restart after completion or failure.
  restartPolicy: Never
  volumes:
    - name: script
      configMap:
        name: backfill-script-v2
  containers:
    - name: backfill
      image: python:3.11-slim
      # Install runtime deps at startup, then run the mounted script.
      command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill.py"]
      volumeMounts:
        - name: script
          mountPath: /scripts

View File

@@ -1,124 +0,0 @@
#!/usr/bin/env python3
"""M3DB Backfill - Pull vLLM/DCGM metrics from Mimir and write to M3DB"""
import struct
import urllib.request
import urllib.error
import urllib.parse
import json
import ssl
import snappy
import base64
import sys
print("Starting backfill script...", flush=True)
# Read credentials from environment (see .env)
import os
# Source (Mimir) and destination (M3DB coordinator) endpoints.
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"
# NOTE(review): 1775040000 decodes to 2026-04-01T10:40:00Z, not the 11:40
# the original comment claimed.
START_TS = 1773187200 # 2026-03-11T00:00:00Z
END_TS = 1775040000 # 2026-04-01T10:40:00Z (just before node restart)
# query_range resolution and chunk width (hours) per Mimir request.
STEP = "10s"
CHUNK_HOURS = 6
METRICS = [
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"DCGM_FI_DEV_GPU_UTIL",
]
def enc(v):
    """Encode a non-negative int as a protobuf base-128 varint (bytes)."""
    b = v & 0x7f
    v >>= 7
    out = b""
    while v:
        out += bytes([0x80 | b])
        b = v & 0x7f
        v >>= 7
    return out + bytes([b])


def es(f, d):
    """Length-delimited (wire type 2) protobuf field `f` with payload `d`."""
    return enc((f << 3) | 2) + enc(len(d)) + d


def ed(f, v):
    """64-bit (wire type 1) protobuf field `f` holding little-endian double `v`."""
    return enc((f << 3) | 1) + struct.pack("<d", v)


def build_ts(labels, samples):
    """Serialize one Prometheus remote-write TimeSeries message.

    labels: dict of label name -> value (repeated Label, field 1).
    samples: iterable of (timestamp_ms, value) pairs (repeated Sample, field 2).
    """
    ts = b""
    for name, value in labels.items():
        label = es(1, name.encode()) + es(2, value.encode())
        ts += enc((1 << 3) | 2) + enc(len(label)) + label
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        ts += enc((2 << 3) | 2) + enc(len(sample)) + sample
    return ts


def ssl_ctx():
    """TLS context with certificate verification disabled (backfill-only)."""
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx


def mimir_req(path):
    """GET MIMIR_URL + path with basic auth; return the parsed JSON body."""
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    resp = urllib.request.urlopen(req, context=ssl_ctx(), timeout=300)
    return json.loads(resp.read().decode())


def write_m3db(data):
    """Snappy-compress a WriteRequest and POST it to the M3DB coordinator.

    Returns True on success; on HTTP error prints a truncated message and
    returns False.
    """
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    try:
        # Response body is not needed; success is the absence of an HTTPError.
        urllib.request.urlopen(req, timeout=300)
        return True
    except urllib.error.HTTPError as e:
        print(f" ERROR {e.code}: {e.read().decode()[:100]}", flush=True)
        return False
# --- main: per-metric chunked backfill (no overlap; chunks are adjacent).
# NOTE(review): indentation reconstructed from a flattened rendering — the
# exact extent of the try block (whether chunks_done is incremented inside it)
# should be confirmed against the original file.
print(f"Time range: {START_TS} to {END_TS}", flush=True)
total = 0
for metric in METRICS:
    print(f"\n{metric}...", flush=True)
    metric_total = 0
    chunk_start = START_TS
    chunks_done = 0
    while chunk_start < END_TS:
        chunk_end = min(chunk_start + CHUNK_HOURS * 3600, END_TS)
        try:
            path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={chunk_start}&end={chunk_end}&step={STEP}"
            data = mimir_req(path)
            if data["status"] != "success":
                chunk_start = chunk_end
                continue
            series = data["data"]["result"]
            samples = sum(len(s["values"]) for s in series)
            if samples > 0:
                wr = b""
                for s in series:
                    labels = dict(s["metric"])
                    labels["cluster"] = "serverless-inference-cluster"
                    # Mimir returns [unix_seconds, "value"]; remote write wants ms.
                    pts = [(int(float(v[0]) * 1000), float(v[1])) for v in s["values"]]
                    ts = build_ts(labels, pts)
                    wr += enc((1 << 3) | 2) + enc(len(ts)) + ts
                if write_m3db(wr):
                    metric_total += samples
            chunks_done += 1
            if chunks_done % 10 == 0:
                print(f" {chunks_done} chunks, {metric_total} samples...", flush=True)
        except Exception as e:
            # Best-effort: log and move to the next chunk.
            print(f" Chunk error: {e}", flush=True)
        chunk_start = chunk_end
    print(f" Done: {metric_total} samples", flush=True)
    total += metric_total
print(f"\nBackfill complete! Total: {total} samples", flush=True)

View File

@@ -1,245 +0,0 @@
#!/usr/bin/env python3
"""
Test script for M3DB read/write functionality.
Usage: python3 test-metrics.py <BASE_URL> [USERNAME] [PASSWORD]
Examples:
python3 test-metrics.py https://m3db.vultrlabs.dev example example
python3 test-metrics.py http://192.168.1.100:7201
"""
import sys
import time
import random
import requests
def main():
if len(sys.argv) < 2:
print("Usage: python3 test-metrics.py <BASE_URL> [USERNAME] [PASSWORD]")
print("Example: python3 test-metrics.py https://m3db.vultrlabs.dev example example")
print(" python3 test-metrics.py http://192.168.1.100:7201")
sys.exit(1)
base_url = sys.argv[1].rstrip('/')
username = sys.argv[2] if len(sys.argv) > 2 else None
password = sys.argv[3] if len(sys.argv) > 3 else None
# Setup auth if provided
auth = (username, password) if username and password else None
print(f"=== M3DB Metrics Test ===")
print(f"URL: {base_url}")
if auth:
print(f"Auth: {username}:***")
print()
# Check coordinator health
print("=== Health Check ===")
health_url = f"{base_url}/health"
try:
resp = requests.get(health_url, auth=auth, timeout=10)
if resp.status_code == 200:
print(f"✓ Coordinator healthy")
elif resp.status_code == 401:
print(f"✗ Authentication required. Provide username and password.")
sys.exit(1)
else:
print(f"✗ Coordinator unhealthy: {resp.status_code}")
sys.exit(1)
except requests.exceptions.RequestException as e:
print(f"✗ Failed to connect: {e}")
sys.exit(1)
# Check placement
print()
print("=== Placement ===")
placement_url = f"{base_url}/api/v1/services/m3db/placement"
try:
resp = requests.get(placement_url, auth=auth, timeout=10)
if resp.status_code == 200:
placement = resp.json()
instances = placement.get("placement", {}).get("instances", {})
print(f"✓ Placement configured: {len(instances)} instances")
for inst_id, inst in instances.items():
print(f" - {inst_id}: {inst.get('endpoint', 'unknown')}")
else:
print(f"✗ Placement not ready: {resp.status_code}")
print(f" Response: {resp.text}")
except requests.exceptions.RequestException as e:
print(f"✗ Failed to get placement: {e}")
# Check namespaces
print()
print("=== Namespaces ===")
namespace_url = f"{base_url}/api/v1/services/m3db/namespace"
try:
resp = requests.get(namespace_url, auth=auth, timeout=10)
if resp.status_code == 200:
ns_data = resp.json()
namespaces = ns_data.get("namespaces", {})
print(f"✓ Namespaces configured: {len(namespaces)}")
for ns_name in namespaces.keys():
print(f" - {ns_name}")
else:
print(f"✗ Namespaces not ready: {resp.status_code}")
except requests.exceptions.RequestException as e:
print(f"✗ Failed to get namespaces: {e}")
# Query test
print()
print("=== Query Test ===")
query_url = f"{base_url}/api/v1/query"
try:
resp = requests.get(query_url, params={"query": "up"}, auth=auth, timeout=10)
if resp.status_code == 200:
result = resp.json()
status = result.get("status")
print(f"✓ Query returned: {status}")
data = result.get("data", {}).get("result", [])
print(f" Results: {len(data)} series")
else:
print(f"✗ Query failed: {resp.status_code}")
except requests.exceptions.RequestException as e:
print(f"✗ Query failed: {e}")
# Write test using Prometheus remote_write
print()
print("=== Write Test ===")
print("Writing metrics via Prometheus remote_write format...")
try:
import struct
import snappy # pip install python-snappy
except ImportError:
print("✗ Missing dependencies for write test")
print(" Install with: pip install python-snappy")
print(" Skipping write test...")
print()
print("=== Test complete (read-only) ===")
return
write_url = f"{base_url}/api/v1/prom/remote/write"
def encode_varint(n):
    """Encode a non-negative integer as a protobuf base-128 varint.

    Each output byte carries 7 payload bits, least-significant group
    first; the continuation (high) bit is set on every byte except the
    last.

    Args:
        n: non-negative int to encode.

    Returns:
        bytes containing the varint encoding.

    Raises:
        ValueError: if n is negative. The previous implementation
            looped forever on negative input; proper negative int64
            varints require a 10-byte two's-complement encoding that
            this helper intentionally does not implement.
    """
    if n < 0:
        raise ValueError("varint encoding requires a non-negative integer")
    out = bytearray()
    while n > 0x7F:
        out.append((n & 0x7F) | 0x80)
        n >>= 7
    out.append(n)
    return bytes(out)
def encode_string(field_num, s):
    """Encode a UTF-8 string as a length-delimited protobuf field.

    Wire layout: one tag byte (field number shifted left 3, wire
    type 2), a varint byte-length, then the raw UTF-8 payload.
    """
    payload = s.encode('utf-8')
    key = bytes([(field_num << 3) | 2])
    return key + encode_varint(len(payload)) + payload
def encode_double(field_num, value):
    """Encode a float as a fixed 64-bit (wire type 1) protobuf field.

    The payload is a little-endian IEEE-754 double, preceded by the
    field's tag byte.
    """
    key = bytes([(field_num << 3) | 1])
    return key + struct.pack('<d', value)
def encode_int64(field_num, value):
    """Encode an integer as a varint (wire type 0) protobuf field.

    NOTE(review): delegates to encode_varint, which does not handle
    negative values; callers here only pass millisecond timestamps.
    """
    key = bytes([(field_num << 3) | 0])
    return key + encode_varint(value)
def encode_label(name, value):
    """Encode one remote-write Label message: name (field 1) then value (field 2)."""
    encoded_name = encode_string(1, name)
    encoded_value = encode_string(2, value)
    return encoded_name + encoded_value
def write_metric(name, value, labels_dict):
    """Build a one-sample Prometheus remote-write request and POST it.

    The protobuf WriteRequest is assembled by hand: each Label is a
    length-delimited field 1 of the TimeSeries, the Sample is field 2,
    and the TimeSeries itself is field 1 of the WriteRequest. The body
    is snappy-compressed, as remote_write requires.

    Returns the HTTP status code from the coordinator.
    """
    timestamp_ms = int(time.time() * 1000)

    def framed_label(label_name, label_value):
        # Wrap one Label message as a length-delimited field-1 entry
        # (tag byte 0x0a) of the TimeSeries.
        msg = encode_label(label_name, label_value)
        return bytes([0x0a]) + encode_varint(len(msg)) + msg

    # The __name__ label goes first, followed by the caller's labels
    # in dict order.
    labels_data = framed_label("__name__", name)
    for label_key, label_value in labels_dict.items():
        labels_data += framed_label(label_key, label_value)

    # Sample message: value (double, field 1) + timestamp ms (int64, field 2).
    sample = encode_double(1, float(value)) + encode_int64(2, timestamp_ms)
    # TimeSeries = labels + Sample as length-delimited field 2 (tag 0x12).
    timeseries = labels_data + bytes([0x12]) + encode_varint(len(sample)) + sample
    # WriteRequest = TimeSeries as length-delimited field 1 (tag 0x0a).
    write_request = bytes([0x0a]) + encode_varint(len(timeseries)) + timeseries

    headers = {
        "Content-Encoding": "snappy",
        "Content-Type": "application/x-protobuf",
        "X-Prometheus-Remote-Write-Version": "0.1.0"
    }
    resp = requests.post(
        write_url,
        data=snappy.compress(write_request),
        headers=headers,
        auth=auth,
        timeout=10,
    )
    return resp.status_code
# Write test metrics with tenant labels
print()
tenants = [
{"tenant": "test-tenant", "service": "api", "env": "test"},
]
ts = int(time.time())
for labels in tenants:
metric_name = f"test_metric_{ts}"
metric_value = random.randint(1, 100)
status = write_metric(metric_name, metric_value, labels)
print(f"✓ Wrote: {metric_name} = {metric_value}")
print(f" Labels: tenant={labels.get('tenant')}, service={labels.get('service')}, env={labels.get('env')}")
# Wait and query back
time.sleep(2)
print()
print("=== Read Back Test ===")
try:
resp = requests.get(query_url, params={"query": metric_name}, auth=auth, timeout=10)
if resp.status_code == 200:
result = resp.json()
data = result.get("data", {}).get("result", [])
if data:
print(f"✓ Metric found!")
for series in data:
metric = series.get("metric", {})
values = series.get("values", series.get("value", []))
print(f" Labels: {metric}")
print(f" Values: {values}")
else:
print(f"✗ Metric not found (may take a moment to index)")
else:
print(f"✗ Query failed: {resp.status_code}")
except requests.exceptions.RequestException as e:
print(f"✗ Query failed: {e}")
print()
print("=== Multi-Tenancy Query Examples ===")
print()
print("Query by tenant:")
print(f" curl -u user:pass '{base_url}/api/v1/query?query={{tenant=\"test-tenant\"}}'")
print()
print("Query by service:")
print(f" curl -u user:pass '{base_url}/api/v1/query?query={{service=\"api\"}}'")
print()
print("Query by env:")
print(f" curl -u user:pass '{base_url}/api/v1/query?query={{env=\"test\"}}'")
print()
print("=== Test complete ===")
# Entry-point guard: run the connectivity test only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    main()

View File

@@ -1,93 +0,0 @@
#!/bin/bash
#
# Simple M3DB connectivity test
# Usage: ./test-metrics.sh <BASE_URL> [USERNAME] [PASSWORD]
#
# Examples:
#   ./test-metrics.sh https://m3db.vultrlabs.dev example example
#   ./test-metrics.sh http://192.168.1.100:7201
#
set -e

BASE_URL="${1:-}"
USERNAME="${2:-}"
PASSWORD="${3:-}"

if [ -z "$BASE_URL" ]; then
    echo "Usage: $0 <BASE_URL> [USERNAME] [PASSWORD]"
    echo "Example: $0 https://m3db.vultrlabs.dev example example"
    echo "         $0 http://192.168.1.100:7201"
    exit 1
fi

# Remove trailing slash if present
BASE_URL="${BASE_URL%/}"

# Build curl auth arguments as an array so credentials containing
# spaces or shell metacharacters survive intact (an unquoted string
# variable would be word-split and could break or leak arguments).
AUTH_ARGS=()
if [ -n "$USERNAME" ] && [ -n "$PASSWORD" ]; then
    AUTH_ARGS=(-u "${USERNAME}:${PASSWORD}")
fi

echo "=== M3DB Connectivity Test ==="
echo "Target: ${BASE_URL}"
if [ ${#AUTH_ARGS[@]} -gt 0 ]; then
    echo "Auth: ${USERNAME}:***"
fi
echo ""

# 1. Coordinator health — fail fast if the endpoint is unreachable.
echo "1. Coordinator Health"
if curl -sf "${AUTH_ARGS[@]}" "${BASE_URL}/health" > /dev/null 2>&1; then
    echo "   ✓ Healthy"
else
    echo "   ✗ Unhealthy or unreachable"
    exit 1
fi

# 2. Placement (cluster topology). The API nests instances under
#    placement.instances; a failed curl falls back to '{}' so the
#    python3 parse below degrades to a zero count instead of aborting.
echo ""
echo "2. Placement (cluster topology)"
PLACEMENT=$(curl -sf "${AUTH_ARGS[@]}" "${BASE_URL}/api/v1/services/m3db/placement" 2>/dev/null || echo '{}')
INSTANCE_COUNT=$(echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); print(len(d))" 2>/dev/null || echo "0")
if [ "$INSTANCE_COUNT" -gt 0 ]; then
    echo "   ✓ $INSTANCE_COUNT instances in placement"
    echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); [print(f'     - {k}') for k in d.keys()]" 2>/dev/null || true
else
    echo "   ✗ No placement configured (run init job)"
fi

# 3. Namespaces (retention policies). Note: this endpoint nests the
#    map under registry.namespaces.
echo ""
echo "3. Namespaces (retention policies)"
NAMESPACES=$(curl -sf "${AUTH_ARGS[@]}" "${BASE_URL}/api/v1/services/m3db/namespace" 2>/dev/null || echo '{}')
NS_COUNT=$(echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); print(len(d))" 2>/dev/null || echo "0")
if [ "$NS_COUNT" -gt 0 ]; then
    echo "   ✓ $NS_COUNT namespaces configured"
    echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); [print(f'     - {k}') for k in d.keys()]" 2>/dev/null || true
else
    echo "   ✗ No namespaces configured (run init job)"
fi

# 4. PromQL query smoke test — any 'success' status counts, even with
#    zero series (a fresh cluster has no data yet).
echo ""
echo "4. Query Test (PromQL)"
QUERY_RESULT=$(curl -sf "${AUTH_ARGS[@]}" "${BASE_URL}/api/v1/query?query=up" 2>/dev/null || echo '{"status":"error"}')
STATUS=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")
if [ "$STATUS" = "success" ]; then
    RESULT_COUNT=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('data',{}).get('result',[])))" 2>/dev/null || echo "0")
    echo "   ✓ Query returned: $RESULT_COUNT series"
else
    echo "   ✗ Query failed"
fi

# 5. Write test — remote_write needs protobuf + snappy encoding, which
#    is out of scope for a curl-only script; point at the Python tool.
echo ""
echo "5. Write Test"
echo "   Note: Prometheus remote_write requires protobuf + snappy encoding."
echo "   Use test-metrics.py for full write/read verification."
echo "   Install: pip install python-snappy requests"
echo ""
echo "=== Test Complete ==="

View File

@@ -7,5 +7,5 @@ resources:
- 02-etcd.yaml
- 03-configmaps.yaml
- 04-m3dbnode.yaml
- 05-m3coordinator.yaml
- 05-m3coordinator-deployment.yaml
- 06-init-and-pdb.yaml