diff --git a/.env.example b/.env.example index 04444dd..58e0547 100644 --- a/.env.example +++ b/.env.example @@ -6,10 +6,6 @@ M3DB_USERNAME= M3DB_PASSWORD= -# Mimir (source for backfill) -MIMIR_USERNAME= -MIMIR_PASSWORD= - # Grafana Admin GRAFANA_ADMIN_PASSWORD= diff --git a/05-m3coordinator-deployment.yaml b/05-m3coordinator-deployment.yaml new file mode 100644 index 0000000..8759cd9 --- /dev/null +++ b/05-m3coordinator-deployment.yaml @@ -0,0 +1,63 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: m3coordinator + namespace: m3db + labels: + app.kubernetes.io/name: m3coordinator +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: m3coordinator + template: + metadata: + labels: + app.kubernetes.io/name: m3coordinator + spec: + containers: + - name: m3coordinator + image: quay.io/m3db/m3coordinator:v1.5.0 + args: + - -f + - /etc/m3coordinator/m3coordinator.yml + ports: + - name: api + containerPort: 7201 + - name: metrics + containerPort: 7203 + livenessProbe: + httpGet: + path: /health + port: 7201 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 7201 + initialDelaySeconds: 5 + periodSeconds: 5 + volumeMounts: + - name: config + mountPath: /etc/m3coordinator + volumes: + - name: config + configMap: + name: m3coordinator-config +--- +apiVersion: v1 +kind: Service +metadata: + name: m3coordinator + namespace: m3db +spec: + selector: + app.kubernetes.io/name: m3coordinator + ports: + - name: api + port: 7201 + targetPort: api + - name: metrics + port: 7203 + targetPort: metrics diff --git a/05-m3coordinator.yaml b/05-m3coordinator.yaml deleted file mode 100644 index d077bfd..0000000 --- a/05-m3coordinator.yaml +++ /dev/null @@ -1,70 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: m3coordinator-config - namespace: m3db - labels: - app.kubernetes.io/name: m3coordinator -data: - m3coordinator.yml: | - listenAddress: 0.0.0.0:7201 - - logging: - level: info - - metrics: - scope: 
- prefix: coordinator - prometheus: - handlerPath: /metrics - listenAddress: 0.0.0.0:7203 - sanitization: prometheus - samplingRate: 1.0 - - tagOptions: - idScheme: quoted - - clusters: - - namespaces: - - namespace: default - type: unaggregated - retention: 720h - - namespace: agg_1m_60d - type: aggregated - retention: 1440h - resolution: 1m - - namespace: agg_1h_1y - type: aggregated - retention: 8760h - resolution: 1h - client: - config: - service: - env: default_env - zone: embedded - service: m3db - cacheDir: /var/lib/m3kv - etcdClusters: - - zone: embedded - endpoints: - - http://etcd-0.etcd.m3db.svc.cluster.local:2379 - - http://etcd-1.etcd.m3db.svc.cluster.local:2379 - - http://etcd-2.etcd.m3db.svc.cluster.local:2379 - writeConsistencyLevel: majority - readConsistencyLevel: unstrict_majority - - downsample: - rules: - mappingRules: - - name: "1min for 60 days" - filter: "__name__:*" - aggregations: ["Last"] - storagePolicies: - - resolution: 1m - retention: 1440h - - name: "1hour for 1 year" - filter: "__name__:*" - aggregations: ["Last"] - storagePolicies: - - resolution: 1h - retention: 8760h diff --git a/06-init-and-pdb.yaml b/06-init-and-pdb.yaml index 28432d0..3aa38a9 100644 --- a/06-init-and-pdb.yaml +++ b/06-init-and-pdb.yaml @@ -63,7 +63,7 @@ spec: COORD="http://m3coordinator.m3db.svc.cluster.local:7201" echo "=== Waiting for coordinator to be healthy ===" - until curl -sf "${COORD}/api/v1/services/m3db/health"; do + until curl -sf "${COORD}/health"; do echo "Coordinator not ready yet, retrying in 5s..." 
sleep 5 done @@ -121,13 +121,13 @@ spec: "repairEnabled": false, "retentionOptions": { "retentionPeriodDuration": "48h", - "blockSizeDuration": "2h", + "blockSizeDuration": "1h", "bufferFutureDuration": "10m", "bufferPastDuration": "10m" }, "indexOptions": { "enabled": true, - "blockSizeDuration": "2h" + "blockSizeDuration": "1h" } } }' @@ -146,13 +146,13 @@ spec: "snapshotEnabled": true, "retentionOptions": { "retentionPeriodDuration": "720h", - "blockSizeDuration": "12h", + "blockSizeDuration": "1h", "bufferFutureDuration": "10m", "bufferPastDuration": "10m" }, "indexOptions": { "enabled": true, - "blockSizeDuration": "12h" + "blockSizeDuration": "1h" }, "aggregationOptions": { "aggregations": [ @@ -181,13 +181,13 @@ spec: "snapshotEnabled": true, "retentionOptions": { "retentionPeriodDuration": "8760h", - "blockSizeDuration": "24h", + "blockSizeDuration": "1h", "bufferFutureDuration": "10m", "bufferPastDuration": "10m" }, "indexOptions": { "enabled": true, - "blockSizeDuration": "24h" + "blockSizeDuration": "1h" }, "aggregationOptions": { "aggregations": [ diff --git a/README.md b/README.md index 54191dd..900dc7f 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,13 @@ Internet → Vultr LoadBalancer → Traefik (TLS + basic auth) → m3coordinator ## Retention Tiers -| Namespace | Resolution | Retention | Use Case | -|----------------|-----------|-----------|---------------------------| -| `default` | raw | 48h | Real-time queries | -| `agg_10s_30d` | 10s | 30 days | Recent dashboards | -| `agg_1m_1y` | 1m | 1 year | Long-term trends/capacity | +All namespaces use **1h block size** — the sweet spot for M3DB. Smaller blocks mean faster queries, faster flushes, and less memory pressure during compaction. See [Why Backfill Doesn't Work](#why-backfill-doesnt-work) for why larger blocks were a disaster. 
+ +| Namespace | Resolution | Retention | Block Size | Use Case | +|----------------|-----------|-----------|------------|---------------------------| +| `default` | raw | 48h | 1h | Real-time queries | +| `agg_10s_30d` | 10s | 30 days | 1h | Recent dashboards | +| `agg_1m_1y` | 1m | 1 year | 1h | Long-term trends/capacity | ## Deployment @@ -96,13 +98,13 @@ kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/ # Create namespaces kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \ - -H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"2h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"2h"}}}' + -H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"}}}' kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \ - -H "Content-Type: application/json" -d '{"name":"agg_10s_30d","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"720h","blockSizeDuration":"12h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"12h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"10s"}}]}}}' + -H "Content-Type: 
application/json" -d '{"name":"agg_10s_30d","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"720h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"10s"}}]}}}' kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \ - -H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"24h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"24h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}' + -H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}' # Wait for bootstrapping to complete (check shard state = AVAILABLE) kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health @@ -250,6 +252,40 @@ remote_write: - **Shards**: The init job creates 64 shards across 3 nodes. For higher cardinality, increase to 128 or 256. - **Volume expansion**: The StorageClass has `allowVolumeExpansion: true` — you can resize PVCs online via `kubectl edit pvc`. 
+## Why Backfill Doesn't Work + +**TL;DR: M3DB is not designed for historical data import. Don't try it.** + +M3DB is a time-series database optimized for real-time ingestion and sequential writes. Backfilling — writing data with timestamps in the past — fights the fundamental architecture at every turn: + +### The Problems + +1. **`bufferPast` is a hard gate.** With cold writes disabled (the namespaces here never set `coldWritesEnabled`), M3DB rejects writes whose timestamps fall outside the `bufferPast` window (10m on every namespace in this setup). To write data from 3 weeks ago, you need `bufferPast=504h` (21 days). This setting is **immutable** on existing namespaces — you have to create entirely new namespaces just for backfill, doubling your operational complexity. + +2. **Massive block sizes were required.** To make the backfill namespaces work with `bufferPast=504h`, block sizes had to be enormous (30+ day blocks). This defeated the entire point of M3DB's time-partitioned storage — blocks that large cause extreme memory pressure, slow compaction, and bloated index lookups. + +3. **Downsample pipeline ignores historical data.** M3DB's downsample coordinator only processes new writes in real-time. Backfilled data written to the `default_backfill` namespace never gets downsampled into the aggregated namespaces, so your long-term retention tiers have gaps. + +4. **No transaction boundaries.** Each backfill write is an individual operation. Writing 12M+ samples means 12M+ individual writes with no batching semantics. If one fails, there's no rollback, no retry from a checkpoint — you get partial data with no easy way to detect or fix gaps. + +5. **Compaction and flush chaos.** M3DB expects data to flow sequentially through commitlog → flush → compact. Backfill dumps data out of order, causing the background compaction to thrash, consuming CPU and I/O for blocks that may never be queried again.
+ +### What We Tried + +- Created `default_backfill`, `agg_10s_backfill`, `agg_1m_backfill` namespaces with `bufferPast=504h` +- Increased block sizes to 24h–30d to accommodate the large bufferPast +- Wrote 12M+ samples from Mimir to M3DB over multiple runs +- Result: Data landed, but the operational cost was catastrophic — huge blocks, no downsampling, and the cluster was unstable + +### What To Do Instead + +- **Start fresh.** Configure M3DB with sane block sizes (1h) from day one and let it accumulate data naturally via Prometheus remote_write. +- **Accept the gap.** Historical data lives in Mimir (or wherever it was before). Query Mimir for old data, M3DB for new data. +- **Dual-write during migration.** Write to both systems simultaneously until M3DB's retention catches up. +- **If you absolutely need old data in M3DB**, accept that you're doing a one-time migration and build tooling around the constraints — but know that it's a project, not a script. + +--- + ## Useful Commands ```bash diff --git a/backfill/BACKFILL_RUNBOOK.md b/backfill/BACKFILL_RUNBOOK.md deleted file mode 100644 index 37186a6..0000000 --- a/backfill/BACKFILL_RUNBOOK.md +++ /dev/null @@ -1,171 +0,0 @@ -# M3DB Backfill Runbook (Revised) - -## Context - -Backfilling ~3 weeks of vLLM + DCGM metrics from Mimir to M3DB. - -**Blocker discovered:** `bufferPast` is immutable on existing namespaces. Downsample pipeline rejects historical writes. - -**Solution:** Create new backfill namespaces with `bufferPast=504h` (21 days). 
- ---- - -## Step 1 — Create Backfill Namespaces - -```bash -COORD="http://m3coordinator.m3db.svc.cluster.local:7201" - -# default_backfill: 7d retention, 21d bufferPast -curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \ - -H "Content-Type: application/json" \ - -d '{ - "name": "default_backfill", - "options": { - "retentionOptions": { - "retentionPeriodDuration": "168h", - "blockSizeDuration": "2h", - "bufferFutureDuration": "10m", - "bufferPastDuration": "504h" - } - } - }' - -# agg_10s_backfill: 90d retention, 10s resolution, 21d bufferPast -curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \ - -H "Content-Type: application/json" \ - -d '{ - "name": "agg_10s_backfill", - "options": { - "retentionOptions": { - "retentionPeriodDuration": "2160h", - "blockSizeDuration": "24h", - "bufferFutureDuration": "10m", - "bufferPastDuration": "504h" - } - }, - "aggregationOptions": { - "aggregations": [{ - "aggregated": true, - "attributes": { - "resolutionNanos": "10000000000", - "downsampleOptions": {"all": true} - } - }] - } - }' - -# agg_1m_backfill: 1y retention, 1m resolution, 21d bufferPast -curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \ - -H "Content-Type: application/json" \ - -d '{ - "name": "agg_1m_backfill", - "options": { - "retentionOptions": { - "retentionPeriodDuration": "8760h", - "blockSizeDuration": "24h", - "bufferFutureDuration": "10m", - "bufferPastDuration": "504h" - } - }, - "aggregationOptions": { - "aggregations": [{ - "aggregated": true, - "attributes": { - "resolutionNanos": "60000000000", - "downsampleOptions": {"all": true} - } - }] - } - }' -``` - ---- - -## Step 2 — Update Coordinator ConfigMap - -Add new namespaces to `m3coordinator-config`: - -```yaml -clusters: - - namespaces: - - namespace: default - type: unaggregated - retention: 168h - - namespace: default_backfill - type: unaggregated - retention: 168h - - namespace: agg_10s_30d - type: aggregated - retention: 2160h - resolution: 10s - - namespace: 
agg_10s_backfill - type: aggregated - retention: 2160h - resolution: 10s - - namespace: agg_1m_1y - type: aggregated - retention: 8760h - resolution: 1m - - namespace: agg_1m_backfill - type: aggregated - retention: 8760h - resolution: 1m -``` - -Also add downsample rules for backfill namespaces. - ---- - -## Step 3 — Restart Coordinators - -```bash -kubectl rollout restart deployment/m3coordinator -n m3db -kubectl rollout status deployment/m3coordinator -n m3db --timeout=120s -``` - ---- - -## Step 4 — Run Backfill - -Write directly to `default_backfill` namespace using `__namespace__` label: - -```python -# In the protobuf write request, add label: -# __namespace__ = "default_backfill" -``` - -Or use the coordinator endpoint: -``` -POST http://m3coordinator:7201/api/v1/prom/remote/write?namespace=default_backfill -``` - -Backfill time range: `2026-03-11T00:00:00Z` to `2026-04-01T00:00:00Z` - ---- - -## Step 5 — Verify - -```bash -curl -sS "http://m3coordinator:7201/api/v1/query" \ - --data-urlencode 'query=vllm:prompt_tokens_total' \ - --data-urlencode 'time=2026-03-20T12:00:00Z' -``` - ---- - -## Step 6 — Revert bufferPast (After Backfill) - -```bash -# After backfill complete, shrink bufferPast back to 10m -# (Only retentionPeriod is mutable, so this requires namespace recreation) -# OR: Leave as-is since it's a backfill-only namespace -``` - ---- - -## Performance Notes - -- M3DB has been fast so far -- New namespaces won't impact existing query performance -- Queries can fan out to both old and new namespaces in parallel -- After backfill, consider consolidating (optional) diff --git a/backfill/README.md b/backfill/README.md deleted file mode 100644 index 0b9f169..0000000 --- a/backfill/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# M3DB Backfill Tools - -Scripts to backfill historical metrics from Mimir to M3DB. 
- -## Prerequisites - -Copy `.env` and set credentials: -```bash -cp .env.example .env -# Edit .env with your credentials -``` - -Required environment variables: -- `MIMIR_USERNAME` - Mimir API username -- `MIMIR_PASSWORD` - Mimir API password - -## Files - -| File | Purpose | -|------|---------| -| `backfill.py` | Main backfill script — pulls from Mimir, writes to M3DB | -| `backfill-gap.py` | Lightweight script for filling specific time gaps | -| `backfill-pod.yaml` | Kubernetes pod manifest for running backfill | -| `BACKFILL_RUNBOOK.md` | Detailed runbook with lessons learned | -| `test-metrics.py` | Test script for verifying data flow | - -## Quick Usage - -### Full Backfill - -```bash -# Edit START_TS and END_TS in backfill.py first -# Format: Unix timestamps (seconds since epoch) - -# Create configmap and run -kubectl create configmap backfill-script --from-file=backfill.py=backfill.py -n m3db -kubectl apply -f backfill-pod.yaml -kubectl logs -f backfill -n m3db -``` - -### Fill a Specific Gap - -Edit `backfill-gap.py` to set your time range: - -```python -START_TS = 1774175400 # Unix timestamp -END_TS = 1774243800 # Unix timestamp -``` - -Then run: - -```bash -kubectl create configmap backfill-gap-script --from-file=backfill-gap.py=backfill-gap.py -n m3db -kubectl apply -f backfill-gap-pod.yaml -kubectl logs -f backfill-gap -n m3db -``` - -## Timestamp Helpers - -```bash -# Convert date to Unix timestamp -date -u -d '2026-03-22 10:30:00' +%s - -# Convert Unix timestamp to date -date -u -d @1774175400 -``` - -## Requirements - -- Mimir credentials (in script) -- M3DB coordinator endpoint: `http://m3coordinator.m3db.svc.cluster.local:7201` -- `bufferPast` must be >= the age of data you're backfilling (currently 21 days) - -## Metrics Backfilled - -- `vllm:prompt_tokens_total` -- `vllm:generation_tokens_total` -- `DCGM_FI_DEV_GPU_UTIL` - -## Cleanup - -After backfill completes: - -```bash -kubectl delete pod backfill -n m3db -kubectl delete configmap 
backfill-script -n m3db -``` diff --git a/backfill/backfill-gap-pod.yaml b/backfill/backfill-gap-pod.yaml deleted file mode 100644 index e6b21ad..0000000 --- a/backfill/backfill-gap-pod.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: backfill-gap - namespace: m3db -spec: - restartPolicy: Never - volumes: - - name: script - configMap: - name: backfill-gap-script - containers: - - name: backfill - image: python:3.11-slim - command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-gap.py"] - volumeMounts: - - name: script - mountPath: /scripts diff --git a/backfill/backfill-gap.py b/backfill/backfill-gap.py deleted file mode 100644 index 33f4ef2..0000000 --- a/backfill/backfill-gap.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -"""Quick backfill for April 1 gap (10:44-11:50 UTC)""" -import struct -import urllib.request -import urllib.error -import urllib.parse -import json -import ssl -import snappy -import base64 - -# Read credentials from environment (see .env) -import os -MIMIR_URL = "https://metrics.vultrlabs.com/prometheus" -MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME") -MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD") -M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201" - -START_TS = 1774175400 # 2026-03-22T10:30:00Z -END_TS = 1774243800 # 2026-03-23T05:30:00Z -STEP = "10s" - -METRICS = ["vllm:prompt_tokens_total", "vllm:generation_tokens_total", "DCGM_FI_DEV_GPU_UTIL"] - -def enc(v): - b = v & 0x7f - v >>= 7 - r = b"" - while v: - r += bytes([0x80 | b]) - b = v & 0x7f - v >>= 7 - return r + bytes([b]) - -def es(f, d): return enc((f<<3)|2) + enc(len(d)) + d -def ed(f, v): return enc((f<<3)|1) + struct.pack(" 0: - wr = b"" - for s in series: - labels = dict(s["metric"]) - labels["cluster"] = "serverless-inference-cluster" - pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]] - ts = 
build_ts(labels, pts) - wr += enc((1<<3)|2) + enc(len(ts)) + ts - if write_m3db(wr): - print(f"{samples} samples written") - total += samples - else: - print("no data") - -print(f"Done! Total: {total} samples") diff --git a/backfill/backfill-massive-pod.yaml b/backfill/backfill-massive-pod.yaml deleted file mode 100644 index 3f37074..0000000 --- a/backfill/backfill-massive-pod.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: backfill-massive - namespace: m3db -spec: - restartPolicy: Never - volumes: - - name: script - configMap: - name: backfill-massive-script - containers: - - name: backfill - image: python:3.11-slim - command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-massive.py"] - volumeMounts: - - name: script - mountPath: /scripts diff --git a/backfill/backfill-massive.py b/backfill/backfill-massive.py deleted file mode 100644 index 27f0017..0000000 --- a/backfill/backfill-massive.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python3 -""" -Massive backfill: March 12 - April 1, 2026 -Writes ONLY to 'default' namespace (raw data) -Overlapping chunks - no gaps! 
-""" -import struct -import urllib.request -import urllib.error -import urllib.parse -import json -import ssl -import snappy -import base64 -import time - -# Read credentials from environment (see .env) -import os -MIMIR_URL = "https://metrics.vultrlabs.com/prometheus" -MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME") -MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD") -M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201" - -# March 12 to April 1 (full range) -START_TS = 1773273600 # March 12 00:00 UTC -END_TS = 1775052000 # April 1 14:00 UTC -CHUNK_HOURS = 4 # 4-hour chunks -OVERLAP_MINUTES = 30 # 30-min overlap between chunks -STEP = "10s" - -METRICS = [ - "vllm:prompt_tokens_total", - "vllm:generation_tokens_total", - "DCGM_FI_DEV_GPU_UTIL", -] - -def enc(v): - b = v & 0x7f - v >>= 7 - r = b"" - while v: - r += bytes([0x80 | b]) - b = v & 0x7f - v >>= 7 - return r + bytes([b]) - -def es(f, d): return enc((f<<3)|2) + enc(len(d)) + d -def ed(f, v): return enc((f<<3)|1) + struct.pack(">= 7 - r = b"" - while v: - r += bytes([0x80 | b]) - b = v & 0x7f - v >>= 7 - return r + bytes([b]) - -def es(f, d): return enc((f<<3)|2) + enc(len(d)) + d -def ed(f, v): return enc((f<<3)|1) + struct.pack(" 0: - wr = b"" - for s in series: - labels = dict(s["metric"]) - labels["cluster"] = "serverless-inference-cluster" - pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]] - ts = build_ts(labels, pts) - wr += enc((1<<3)|2) + enc(len(ts)) + ts - if write_m3db(wr): - metric_total += samples - chunks_done += 1 - if chunks_done % 10 == 0: - print(f" {chunks_done} chunks, {metric_total} samples...", flush=True) - - except Exception as e: - print(f" Chunk error: {e}", flush=True) - - chunk_start = chunk_end - - print(f" Done: {metric_total} samples", flush=True) - total += metric_total - -print(f"\nBackfill complete! 
Total: {total} samples", flush=True) diff --git a/backfill/test-metrics.py b/backfill/test-metrics.py deleted file mode 100644 index 481462c..0000000 --- a/backfill/test-metrics.py +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script for M3DB read/write functionality. -Usage: python3 test-metrics.py [USERNAME] [PASSWORD] - -Examples: - python3 test-metrics.py https://m3db.vultrlabs.dev example example - python3 test-metrics.py http://192.168.1.100:7201 -""" - -import sys -import time -import random -import requests - - -def main(): - if len(sys.argv) < 2: - print("Usage: python3 test-metrics.py [USERNAME] [PASSWORD]") - print("Example: python3 test-metrics.py https://m3db.vultrlabs.dev example example") - print(" python3 test-metrics.py http://192.168.1.100:7201") - sys.exit(1) - - base_url = sys.argv[1].rstrip('/') - username = sys.argv[2] if len(sys.argv) > 2 else None - password = sys.argv[3] if len(sys.argv) > 3 else None - - # Setup auth if provided - auth = (username, password) if username and password else None - - print(f"=== M3DB Metrics Test ===") - print(f"URL: {base_url}") - if auth: - print(f"Auth: {username}:***") - print() - - # Check coordinator health - print("=== Health Check ===") - health_url = f"{base_url}/health" - try: - resp = requests.get(health_url, auth=auth, timeout=10) - if resp.status_code == 200: - print(f"✓ Coordinator healthy") - elif resp.status_code == 401: - print(f"✗ Authentication required. 
Provide username and password.") - sys.exit(1) - else: - print(f"✗ Coordinator unhealthy: {resp.status_code}") - sys.exit(1) - except requests.exceptions.RequestException as e: - print(f"✗ Failed to connect: {e}") - sys.exit(1) - - # Check placement - print() - print("=== Placement ===") - placement_url = f"{base_url}/api/v1/services/m3db/placement" - try: - resp = requests.get(placement_url, auth=auth, timeout=10) - if resp.status_code == 200: - placement = resp.json() - instances = placement.get("placement", {}).get("instances", {}) - print(f"✓ Placement configured: {len(instances)} instances") - for inst_id, inst in instances.items(): - print(f" - {inst_id}: {inst.get('endpoint', 'unknown')}") - else: - print(f"✗ Placement not ready: {resp.status_code}") - print(f" Response: {resp.text}") - except requests.exceptions.RequestException as e: - print(f"✗ Failed to get placement: {e}") - - # Check namespaces - print() - print("=== Namespaces ===") - namespace_url = f"{base_url}/api/v1/services/m3db/namespace" - try: - resp = requests.get(namespace_url, auth=auth, timeout=10) - if resp.status_code == 200: - ns_data = resp.json() - namespaces = ns_data.get("namespaces", {}) - print(f"✓ Namespaces configured: {len(namespaces)}") - for ns_name in namespaces.keys(): - print(f" - {ns_name}") - else: - print(f"✗ Namespaces not ready: {resp.status_code}") - except requests.exceptions.RequestException as e: - print(f"✗ Failed to get namespaces: {e}") - - # Query test - print() - print("=== Query Test ===") - query_url = f"{base_url}/api/v1/query" - try: - resp = requests.get(query_url, params={"query": "up"}, auth=auth, timeout=10) - if resp.status_code == 200: - result = resp.json() - status = result.get("status") - print(f"✓ Query returned: {status}") - data = result.get("data", {}).get("result", []) - print(f" Results: {len(data)} series") - else: - print(f"✗ Query failed: {resp.status_code}") - except requests.exceptions.RequestException as e: - print(f"✗ Query failed: 
{e}") - - # Write test using Prometheus remote_write - print() - print("=== Write Test ===") - print("Writing metrics via Prometheus remote_write format...") - - try: - import struct - import snappy # pip install python-snappy - except ImportError: - print("✗ Missing dependencies for write test") - print(" Install with: pip install python-snappy") - print(" Skipping write test...") - print() - print("=== Test complete (read-only) ===") - return - - write_url = f"{base_url}/api/v1/prom/remote/write" - - def encode_varint(n): - """Encode a varint""" - result = [] - while n > 127: - result.append((n & 0x7F) | 0x80) - n >>= 7 - result.append(n) - return bytes(result) - - def encode_string(field_num, s): - """Encode a string field in protobuf""" - data = s.encode('utf-8') - tag = (field_num << 3) | 2 - return bytes([tag]) + encode_varint(len(data)) + data - - def encode_double(field_num, value): - """Encode a double field in protobuf""" - tag = (field_num << 3) | 1 - return bytes([tag]) + struct.pack(' [USERNAME] [PASSWORD] -# -# Examples: -# ./test-metrics.sh https://m3db.vultrlabs.dev example example -# ./test-metrics.sh http://192.168.1.100:7201 -# - -set -e - -BASE_URL="${1:-}" -USERNAME="${2:-}" -PASSWORD="${3:-}" - -if [ -z "$BASE_URL" ]; then - echo "Usage: $0 [USERNAME] [PASSWORD]" - echo "Example: $0 https://m3db.vultrlabs.dev example example" - echo " $0 http://192.168.1.100:7201" - exit 1 -fi - -# Remove trailing slash if present -BASE_URL="${BASE_URL%/}" - -# Build auth flag if credentials provided -AUTH_FLAG="" -if [ -n "$USERNAME" ] && [ -n "$PASSWORD" ]; then - AUTH_FLAG="-u ${USERNAME}:${PASSWORD}" -fi - -echo "=== M3DB Connectivity Test ===" -echo "Target: ${BASE_URL}" -if [ -n "$AUTH_FLAG" ]; then - echo "Auth: ${USERNAME}:***" -fi -echo "" - -# Health check -echo "1. 
Coordinator Health" -if curl -sf $AUTH_FLAG "${BASE_URL}/health" > /dev/null 2>&1; then - echo " ✓ Healthy" -else - echo " ✗ Unhealthy or unreachable" - exit 1 -fi - -# Placement -echo "" -echo "2. Placement (cluster topology)" -PLACEMENT=$(curl -sf $AUTH_FLAG "${BASE_URL}/api/v1/services/m3db/placement" 2>/dev/null || echo '{}') -INSTANCE_COUNT=$(echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); print(len(d))" 2>/dev/null || echo "0") -if [ "$INSTANCE_COUNT" -gt 0 ]; then - echo " ✓ $INSTANCE_COUNT instances in placement" - echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); [print(f' - {k}') for k in d.keys()]" 2>/dev/null || true -else - echo " ✗ No placement configured (run init job)" -fi - -# Namespaces -echo "" -echo "3. Namespaces (retention policies)" -NAMESPACES=$(curl -sf $AUTH_FLAG "${BASE_URL}/api/v1/services/m3db/namespace" 2>/dev/null || echo '{}') -NS_COUNT=$(echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); print(len(d))" 2>/dev/null || echo "0") -if [ "$NS_COUNT" -gt 0 ]; then - echo " ✓ $NS_COUNT namespaces configured" - echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); [print(f' - {k}') for k in d.keys()]" 2>/dev/null || true -else - echo " ✗ No namespaces configured (run init job)" -fi - -# Query test -echo "" -echo "4. 
Query Test (PromQL)" -QUERY_RESULT=$(curl -sf $AUTH_FLAG "${BASE_URL}/api/v1/query?query=up" 2>/dev/null || echo '{"status":"error"}') -STATUS=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error") -if [ "$STATUS" = "success" ]; then - RESULT_COUNT=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('data',{}).get('result',[])))" 2>/dev/null || echo "0") - echo " ✓ Query returned: $RESULT_COUNT series" -else - echo " ✗ Query failed" -fi - -# Write test (requires protobuf + snappy, so just note it) -echo "" -echo "5. Write Test" -echo " Note: Prometheus remote_write requires protobuf + snappy encoding." -echo " Use test-metrics.py for full write/read verification." -echo " Install: pip install python-snappy requests" - -echo "" -echo "=== Test Complete ===" diff --git a/kustomization.yaml b/kustomization.yaml index 7984b74..ba02f1f 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -7,5 +7,5 @@ resources: - 02-etcd.yaml - 03-configmaps.yaml - 04-m3dbnode.yaml - - 05-m3coordinator.yaml + - 05-m3coordinator-deployment.yaml - 06-init-and-pdb.yaml