Clean slate: 1h block sizes, remove backfill artifacts
- Changed all namespace block sizes to 1h (was 2h/12h/24h in manifests, 30d+ in the live cluster due to backfill-era bufferPast hacks)
- Deleted entire backfill/ directory (scripts, pods, runbooks)
- Removed stale 05-m3coordinator.yaml (had backfill namespaces)
- Added 05-m3coordinator-deployment.yaml to kustomization
- Fixed init job health check (/health instead of /api/v1/services/m3db/health)
- Updated .env.example (removed Mimir credentials)
- Added 'Why Backfill Doesn't Work' section to README
This commit is contained in:
@@ -6,10 +6,6 @@
|
||||
M3DB_USERNAME=<your-m3db-username>
|
||||
M3DB_PASSWORD=<your-m3db-password>
|
||||
|
||||
# Mimir (source for backfill)
|
||||
MIMIR_USERNAME=<your-mimir-username>
|
||||
MIMIR_PASSWORD=<your-mimir-password>
|
||||
|
||||
# Grafana Admin
|
||||
GRAFANA_ADMIN_PASSWORD=<your-grafana-admin-password>
|
||||
|
||||
|
||||
63
05-m3coordinator-deployment.yaml
Normal file
63
05-m3coordinator-deployment.yaml
Normal file
@@ -0,0 +1,63 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: m3coordinator
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
spec:
|
||||
replicas: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
spec:
|
||||
containers:
|
||||
- name: m3coordinator
|
||||
image: quay.io/m3db/m3coordinator:v1.5.0
|
||||
args:
|
||||
- -f
|
||||
- /etc/m3coordinator/m3coordinator.yml
|
||||
ports:
|
||||
- name: api
|
||||
containerPort: 7201
|
||||
- name: metrics
|
||||
containerPort: 7203
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 7201
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 7201
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/m3coordinator
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: m3coordinator-config
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: m3coordinator
|
||||
namespace: m3db
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
ports:
|
||||
- name: api
|
||||
port: 7201
|
||||
targetPort: api
|
||||
- name: metrics
|
||||
port: 7203
|
||||
targetPort: metrics
|
||||
@@ -1,70 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: m3coordinator-config
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
data:
|
||||
m3coordinator.yml: |
|
||||
listenAddress: 0.0.0.0:7201
|
||||
|
||||
logging:
|
||||
level: info
|
||||
|
||||
metrics:
|
||||
scope:
|
||||
prefix: coordinator
|
||||
prometheus:
|
||||
handlerPath: /metrics
|
||||
listenAddress: 0.0.0.0:7203
|
||||
sanitization: prometheus
|
||||
samplingRate: 1.0
|
||||
|
||||
tagOptions:
|
||||
idScheme: quoted
|
||||
|
||||
clusters:
|
||||
- namespaces:
|
||||
- namespace: default
|
||||
type: unaggregated
|
||||
retention: 720h
|
||||
- namespace: agg_1m_60d
|
||||
type: aggregated
|
||||
retention: 1440h
|
||||
resolution: 1m
|
||||
- namespace: agg_1h_1y
|
||||
type: aggregated
|
||||
retention: 8760h
|
||||
resolution: 1h
|
||||
client:
|
||||
config:
|
||||
service:
|
||||
env: default_env
|
||||
zone: embedded
|
||||
service: m3db
|
||||
cacheDir: /var/lib/m3kv
|
||||
etcdClusters:
|
||||
- zone: embedded
|
||||
endpoints:
|
||||
- http://etcd-0.etcd.m3db.svc.cluster.local:2379
|
||||
- http://etcd-1.etcd.m3db.svc.cluster.local:2379
|
||||
- http://etcd-2.etcd.m3db.svc.cluster.local:2379
|
||||
writeConsistencyLevel: majority
|
||||
readConsistencyLevel: unstrict_majority
|
||||
|
||||
downsample:
|
||||
rules:
|
||||
mappingRules:
|
||||
- name: "1min for 60 days"
|
||||
filter: "__name__:*"
|
||||
aggregations: ["Last"]
|
||||
storagePolicies:
|
||||
- resolution: 1m
|
||||
retention: 1440h
|
||||
- name: "1hour for 1 year"
|
||||
filter: "__name__:*"
|
||||
aggregations: ["Last"]
|
||||
storagePolicies:
|
||||
- resolution: 1h
|
||||
retention: 8760h
|
||||
@@ -63,7 +63,7 @@ spec:
|
||||
COORD="http://m3coordinator.m3db.svc.cluster.local:7201"
|
||||
|
||||
echo "=== Waiting for coordinator to be healthy ==="
|
||||
until curl -sf "${COORD}/api/v1/services/m3db/health"; do
|
||||
until curl -sf "${COORD}/health"; do
|
||||
echo "Coordinator not ready yet, retrying in 5s..."
|
||||
sleep 5
|
||||
done
|
||||
@@ -121,13 +121,13 @@ spec:
|
||||
"repairEnabled": false,
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "48h",
|
||||
"blockSizeDuration": "2h",
|
||||
"blockSizeDuration": "1h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "10m"
|
||||
},
|
||||
"indexOptions": {
|
||||
"enabled": true,
|
||||
"blockSizeDuration": "2h"
|
||||
"blockSizeDuration": "1h"
|
||||
}
|
||||
}
|
||||
}'
|
||||
@@ -146,13 +146,13 @@ spec:
|
||||
"snapshotEnabled": true,
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "720h",
|
||||
"blockSizeDuration": "12h",
|
||||
"blockSizeDuration": "1h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "10m"
|
||||
},
|
||||
"indexOptions": {
|
||||
"enabled": true,
|
||||
"blockSizeDuration": "12h"
|
||||
"blockSizeDuration": "1h"
|
||||
},
|
||||
"aggregationOptions": {
|
||||
"aggregations": [
|
||||
@@ -181,13 +181,13 @@ spec:
|
||||
"snapshotEnabled": true,
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "8760h",
|
||||
"blockSizeDuration": "24h",
|
||||
"blockSizeDuration": "1h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "10m"
|
||||
},
|
||||
"indexOptions": {
|
||||
"enabled": true,
|
||||
"blockSizeDuration": "24h"
|
||||
"blockSizeDuration": "1h"
|
||||
},
|
||||
"aggregationOptions": {
|
||||
"aggregations": [
|
||||
|
||||
52
README.md
52
README.md
@@ -44,11 +44,13 @@ Internet → Vultr LoadBalancer → Traefik (TLS + basic auth) → m3coordinator
|
||||
|
||||
## Retention Tiers
|
||||
|
||||
| Namespace | Resolution | Retention | Use Case |
|
||||
|----------------|-----------|-----------|---------------------------|
|
||||
| `default` | raw | 48h | Real-time queries |
|
||||
| `agg_10s_30d` | 10s | 30 days | Recent dashboards |
|
||||
| `agg_1m_1y` | 1m | 1 year | Long-term trends/capacity |
|
||||
All namespaces use **1h block size** — the sweet spot for M3DB. Smaller blocks mean faster queries, faster flushes, and less memory pressure during compaction. See [Why Backfill Doesn't Work](#why-backfill-doesnt-work) for why larger blocks were a disaster.
|
||||
|
||||
| Namespace | Resolution | Retention | Block Size | Use Case |
|
||||
|----------------|-----------|-----------|------------|---------------------------|
|
||||
| `default` | raw | 48h | 1h | Real-time queries |
|
||||
| `agg_10s_30d` | 10s | 30 days | 1h | Recent dashboards |
|
||||
| `agg_1m_1y` | 1m | 1 year | 1h | Long-term trends/capacity |
|
||||
|
||||
## Deployment
|
||||
|
||||
@@ -96,13 +98,13 @@ kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/
|
||||
|
||||
# Create namespaces
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
|
||||
-H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"2h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"2h"}}}'
|
||||
-H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"}}}'
|
||||
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
|
||||
-H "Content-Type: application/json" -d '{"name":"agg_10s_30d","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"720h","blockSizeDuration":"12h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"12h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"10s"}}]}}}'
|
||||
-H "Content-Type: application/json" -d '{"name":"agg_10s_30d","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"720h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"10s"}}]}}}'
|
||||
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
|
||||
-H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"24h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"24h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}'
|
||||
-H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"1h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"1h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}'
|
||||
|
||||
# Wait for bootstrapping to complete (check shard state = AVAILABLE)
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health
|
||||
@@ -250,6 +252,40 @@ remote_write:
|
||||
- **Shards**: The init job creates 64 shards across 3 nodes. For higher cardinality, increase to 128 or 256.
|
||||
- **Volume expansion**: The StorageClass has `allowVolumeExpansion: true` — you can resize PVCs online via `kubectl edit pvc`.
|
||||
|
||||
## Why Backfill Doesn't Work
|
||||
|
||||
**TL;DR: M3DB is not designed for historical data import. Don't try it.**
|
||||
|
||||
M3DB is a time-series database optimized for real-time ingestion and sequential writes. Backfilling — writing data with timestamps in the past — fights the fundamental architecture at every turn:
|
||||
|
||||
### The Problems
|
||||
|
||||
1. **`bufferPast` is a hard gate.** M3DB rejects writes whose timestamps fall outside the `bufferPast` window (default: 10m). To write data from 3 weeks ago, you need `bufferPast=504h` (21 days). This setting is **immutable** on existing namespaces — you have to create entirely new namespaces just for backfill, doubling your operational complexity.
|
||||
|
||||
2. **Massive block sizes were required.** M3DB requires `bufferPast` to be smaller than the namespace's block size, so to make the backfill namespaces work with `bufferPast=504h`, block sizes had to be enormous (30+ day blocks). This defeated the entire point of M3DB's time-partitioned storage — blocks that large cause extreme memory pressure, slow compaction, and bloated index lookups.
|
||||
|
||||
3. **Downsample pipeline ignores historical data.** M3DB's downsample coordinator only processes new writes in real-time. Backfilled data written to `default_backfill` namespaces never gets downsampled into aggregated namespaces, so your long-term retention tiers have gaps.
|
||||
|
||||
4. **No transaction boundaries.** Each backfill write is an individual operation. Writing 12M+ samples means 12M+ individual writes with no batching semantics. If one fails, there's no rollback, no retry from a checkpoint — you get partial data with no easy way to detect or fix gaps.
|
||||
|
||||
5. **Compaction and flush chaos.** M3DB expects data to flow sequentially through commitlog → flush → compact. Backfill dumps data out of order, causing the background compaction to thrash, consuming CPU and I/O for blocks that may never be queried again.
|
||||
|
||||
### What We Tried
|
||||
|
||||
- Created `default_backfill`, `agg_10s_backfill`, `agg_1m_backfill` namespaces with `bufferPast=504h`
|
||||
- Increased block sizes to 24h–30d to accommodate the large bufferPast
|
||||
- Wrote 12M+ samples from Mimir to M3DB over multiple runs
|
||||
- Result: Data landed, but the operational cost was catastrophic — huge blocks, no downsampling, and the cluster was unstable
|
||||
|
||||
### What To Do Instead
|
||||
|
||||
- **Start fresh.** Configure M3DB with sane block sizes (1h) from day one and let it accumulate data naturally via Prometheus remote_write.
|
||||
- **Accept the gap.** Historical data lives in Mimir (or wherever it was before). Query Mimir for old data, M3DB for new data.
|
||||
- **Dual-write during migration.** Write to both systems simultaneously until M3DB's retention catches up.
|
||||
- **If you absolutely need old data in M3DB**, accept that you're doing a one-time migration and build tooling around the constraints — but know that it's a project, not a script.
|
||||
|
||||
---
|
||||
|
||||
## Useful Commands
|
||||
|
||||
```bash
|
||||
|
||||
@@ -1,171 +0,0 @@
|
||||
# M3DB Backfill Runbook (Revised)
|
||||
|
||||
## Context
|
||||
|
||||
Backfilling ~3 weeks of vLLM + DCGM metrics from Mimir to M3DB.
|
||||
|
||||
**Blocker discovered:** `bufferPast` is immutable on existing namespaces. Downsample pipeline rejects historical writes.
|
||||
|
||||
**Solution:** Create new backfill namespaces with `bufferPast=504h` (21 days).
|
||||
|
||||
---
|
||||
|
||||
## Step 1 — Create Backfill Namespaces
|
||||
|
||||
```bash
|
||||
COORD="http://m3coordinator.m3db.svc.cluster.local:7201"
|
||||
|
||||
# default_backfill: 7d retention, 21d bufferPast
|
||||
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "default_backfill",
|
||||
"options": {
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "168h",
|
||||
"blockSizeDuration": "2h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "504h"
|
||||
}
|
||||
}
|
||||
}'
|
||||
|
||||
# agg_10s_backfill: 90d retention, 10s resolution, 21d bufferPast
|
||||
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "agg_10s_backfill",
|
||||
"options": {
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "2160h",
|
||||
"blockSizeDuration": "24h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "504h"
|
||||
}
|
||||
},
|
||||
"aggregationOptions": {
|
||||
"aggregations": [{
|
||||
"aggregated": true,
|
||||
"attributes": {
|
||||
"resolutionNanos": "10000000000",
|
||||
"downsampleOptions": {"all": true}
|
||||
}
|
||||
}]
|
||||
}
|
||||
}'
|
||||
|
||||
# agg_1m_backfill: 1y retention, 1m resolution, 21d bufferPast
|
||||
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "agg_1m_backfill",
|
||||
"options": {
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "8760h",
|
||||
"blockSizeDuration": "24h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "504h"
|
||||
}
|
||||
},
|
||||
"aggregationOptions": {
|
||||
"aggregations": [{
|
||||
"aggregated": true,
|
||||
"attributes": {
|
||||
"resolutionNanos": "60000000000",
|
||||
"downsampleOptions": {"all": true}
|
||||
}
|
||||
}]
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 2 — Update Coordinator ConfigMap
|
||||
|
||||
Add new namespaces to `m3coordinator-config`:
|
||||
|
||||
```yaml
|
||||
clusters:
|
||||
- namespaces:
|
||||
- namespace: default
|
||||
type: unaggregated
|
||||
retention: 168h
|
||||
- namespace: default_backfill
|
||||
type: unaggregated
|
||||
retention: 168h
|
||||
- namespace: agg_10s_30d
|
||||
type: aggregated
|
||||
retention: 2160h
|
||||
resolution: 10s
|
||||
- namespace: agg_10s_backfill
|
||||
type: aggregated
|
||||
retention: 2160h
|
||||
resolution: 10s
|
||||
- namespace: agg_1m_1y
|
||||
type: aggregated
|
||||
retention: 8760h
|
||||
resolution: 1m
|
||||
- namespace: agg_1m_backfill
|
||||
type: aggregated
|
||||
retention: 8760h
|
||||
resolution: 1m
|
||||
```
|
||||
|
||||
Also add downsample rules for backfill namespaces.
|
||||
|
||||
---
|
||||
|
||||
## Step 3 — Restart Coordinators
|
||||
|
||||
```bash
|
||||
kubectl rollout restart deployment/m3coordinator -n m3db
|
||||
kubectl rollout status deployment/m3coordinator -n m3db --timeout=120s
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 4 — Run Backfill
|
||||
|
||||
Write directly to `default_backfill` namespace using `__namespace__` label:
|
||||
|
||||
```python
|
||||
# In the protobuf write request, add label:
|
||||
# __namespace__ = "default_backfill"
|
||||
```
|
||||
|
||||
Or use the coordinator endpoint:
|
||||
```
|
||||
POST http://m3coordinator:7201/api/v1/prom/remote/write?namespace=default_backfill
|
||||
```
|
||||
|
||||
Backfill time range: `2026-03-11T00:00:00Z` to `2026-04-01T00:00:00Z`
|
||||
|
||||
---
|
||||
|
||||
## Step 5 — Verify
|
||||
|
||||
```bash
|
||||
curl -sS "http://m3coordinator:7201/api/v1/query" \
|
||||
--data-urlencode 'query=vllm:prompt_tokens_total' \
|
||||
--data-urlencode 'time=2026-03-20T12:00:00Z'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 6 — Revert bufferPast (After Backfill)
|
||||
|
||||
```bash
|
||||
# After backfill complete, shrink bufferPast back to 10m
|
||||
# (Only retentionPeriod is mutable, so this requires namespace recreation)
|
||||
# OR: Leave as-is since it's a backfill-only namespace
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Notes
|
||||
|
||||
- M3DB has been fast so far
|
||||
- New namespaces won't impact existing query performance
|
||||
- Queries can fan out to both old and new namespaces in parallel
|
||||
- After backfill, consider consolidating (optional)
|
||||
@@ -1,87 +0,0 @@
|
||||
# M3DB Backfill Tools
|
||||
|
||||
Scripts to backfill historical metrics from Mimir to M3DB.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Copy `.env.example` to `.env` and set your credentials:
|
||||
```bash
|
||||
cp .env.example .env
|
||||
# Edit .env with your credentials
|
||||
```
|
||||
|
||||
Required environment variables:
|
||||
- `MIMIR_USERNAME` - Mimir API username
|
||||
- `MIMIR_PASSWORD` - Mimir API password
|
||||
|
||||
## Files
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `backfill.py` | Main backfill script — pulls from Mimir, writes to M3DB |
|
||||
| `backfill-gap.py` | Lightweight script for filling specific time gaps |
|
||||
| `backfill-pod.yaml` | Kubernetes pod manifest for running backfill |
|
||||
| `BACKFILL_RUNBOOK.md` | Detailed runbook with lessons learned |
|
||||
| `test-metrics.py` | Test script for verifying data flow |
|
||||
|
||||
## Quick Usage
|
||||
|
||||
### Full Backfill
|
||||
|
||||
```bash
|
||||
# Edit START_TS and END_TS in backfill.py first
|
||||
# Format: Unix timestamps (seconds since epoch)
|
||||
|
||||
# Create configmap and run
|
||||
kubectl create configmap backfill-script --from-file=backfill.py=backfill.py -n m3db
|
||||
kubectl apply -f backfill-pod.yaml
|
||||
kubectl logs -f backfill -n m3db
|
||||
```
|
||||
|
||||
### Fill a Specific Gap
|
||||
|
||||
Edit `backfill-gap.py` to set your time range:
|
||||
|
||||
```python
|
||||
START_TS = 1774175400 # Unix timestamp
|
||||
END_TS = 1774243800 # Unix timestamp
|
||||
```
|
||||
|
||||
Then run:
|
||||
|
||||
```bash
|
||||
kubectl create configmap backfill-gap-script --from-file=backfill-gap.py=backfill-gap.py -n m3db
|
||||
kubectl apply -f backfill-gap-pod.yaml
|
||||
kubectl logs -f backfill-gap -n m3db
|
||||
```
|
||||
|
||||
## Timestamp Helpers
|
||||
|
||||
```bash
|
||||
# Convert date to Unix timestamp
|
||||
date -u -d '2026-03-22 10:30:00' +%s
|
||||
|
||||
# Convert Unix timestamp to date
|
||||
date -u -d @1774175400
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Mimir credentials (`MIMIR_USERNAME` / `MIMIR_PASSWORD`, read from the environment)
|
||||
- M3DB coordinator endpoint: `http://m3coordinator.m3db.svc.cluster.local:7201`
|
||||
- `bufferPast` must be >= the age of data you're backfilling (currently 21 days)
|
||||
|
||||
## Metrics Backfilled
|
||||
|
||||
- `vllm:prompt_tokens_total`
|
||||
- `vllm:generation_tokens_total`
|
||||
- `DCGM_FI_DEV_GPU_UTIL`
|
||||
|
||||
## Cleanup
|
||||
|
||||
After backfill completes:
|
||||
|
||||
```bash
|
||||
kubectl delete pod backfill -n m3db
|
||||
kubectl delete configmap backfill-script -n m3db
|
||||
```
|
||||
@@ -1,18 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: backfill-gap
|
||||
namespace: m3db
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
volumes:
|
||||
- name: script
|
||||
configMap:
|
||||
name: backfill-gap-script
|
||||
containers:
|
||||
- name: backfill
|
||||
image: python:3.11-slim
|
||||
command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-gap.py"]
|
||||
volumeMounts:
|
||||
- name: script
|
||||
mountPath: /scripts
|
||||
@@ -1,100 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Quick backfill for April 1 gap (10:44-11:50 UTC)"""
|
||||
import struct
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import json
|
||||
import ssl
|
||||
import snappy
|
||||
import base64
|
||||
|
||||
# Read credentials from environment (see .env)
|
||||
import os
|
||||
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
|
||||
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
|
||||
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
|
||||
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"
|
||||
|
||||
START_TS = 1774175400 # 2026-03-22T10:30:00Z
|
||||
END_TS = 1774243800 # 2026-03-23T05:30:00Z
|
||||
STEP = "10s"
|
||||
|
||||
METRICS = ["vllm:prompt_tokens_total", "vllm:generation_tokens_total", "DCGM_FI_DEV_GPU_UTIL"]
|
||||
|
||||
def enc(v):
    """Encode the integer `v` as a protobuf base-128 varint and return the bytes.

    Each output byte carries 7 payload bits, least-significant group first;
    the high bit is set on every byte except the last.

    Negative values are encoded as their 64-bit two's-complement form (the
    protobuf int64 convention, always 10 bytes). Previously a negative input
    never terminated, because an arithmetic right shift of a negative Python
    int never reaches 0.
    """
    if v < 0:
        v &= 0xFFFFFFFFFFFFFFFF
    out = bytearray()
    while v > 0x7F:
        out.append(0x80 | (v & 0x7F))  # continuation bit + low 7 bits
        v >>= 7
    out.append(v)  # final group, continuation bit clear
    return bytes(out)
|
||||
|
||||
def es(f, d):
    """Encode field number `f` as a length-delimited field (wire type 2).

    `d` must already be bytes; the result is tag + varint length + payload.
    """
    tag = enc((f << 3) | 2)
    return tag + enc(len(d)) + d


def ed(f, v):
    """Encode field number `f` as a 64-bit field (wire type 1).

    `v` is packed as a little-endian IEEE-754 double after the tag.
    """
    tag = enc((f << 3) | 1)
    return tag + struct.pack("<d", v)
|
||||
|
||||
def build_ts(labels, samples):
    """Serialize one remote-write TimeSeries message body.

    Args:
        labels: dict mapping label name (str) to label value (str).
        samples: iterable of (t_ms, val) pairs; t_ms is an integer timestamp
            written as a varint, val is a float written as a double.

    Returns:
        bytes: the labels as repeated field 1 (name = field 1, value =
        field 2) followed by the samples as repeated field 2 (value =
        field 1, timestamp = field 2).

    Labels are emitted sorted by name. The previous version used dict
    insertion order, and the callers append "cluster" last, which produced
    unsorted label sets; the Prometheus remote-write spec requires labels
    to be sorted lexicographically by name.
    """
    ts = b""
    for name, value in sorted(labels.items()):
        label = es(1, name.encode()) + es(2, value.encode())
        ts += enc((1 << 3) | 2) + enc(len(label)) + label
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        ts += enc((2 << 3) | 2) + enc(len(sample)) + sample
    return ts
|
||||
|
||||
def ssl_ctx():
    """Return a default SSL context with hostname checks and cert verification off.

    NOTE(review): mimir_req() uses this context while sending HTTP Basic
    credentials, so the server's identity is never verified — confirm this
    is intentional.
    """
    insecure = ssl.create_default_context()
    # check_hostname has to be cleared first; CERT_NONE is rejected while it is on.
    insecure.check_hostname = False
    insecure.verify_mode = ssl.CERT_NONE
    return insecure
|
||||
|
||||
def mimir_req(path):
    """GET `MIMIR_URL + path` with HTTP Basic auth and return the parsed JSON body.

    Args:
        path: URL path plus query string, appended verbatim to MIMIR_URL.

    Returns:
        The decoded JSON response.

    Raises:
        Whatever urllib raises on HTTP/connection errors (not caught here),
        and json.JSONDecodeError on a non-JSON body.
    """
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    # Close the response explicitly; it was previously left for the garbage
    # collector to release.
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
|
||||
|
||||
def write_m3db(data):
    """POST a serialized remote-write payload to the M3 coordinator.

    Args:
        data: uncompressed protobuf bytes; snappy-compressed before sending.

    Returns:
        True if the request completed without an HTTP error, False if the
        coordinator answered with an HTTP error status (the status and the
        first 100 characters of the body are printed). Other exceptions
        (e.g. connection failures) still propagate.
    """
    payload = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", payload, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    try:
        # Context manager closes the response; it used to be left open.
        with urllib.request.urlopen(req, timeout=300):
            return True
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error body must not mask the HTTP error.
        print(f" ERROR {e.code}: {e.read().decode(errors='replace')[:100]}")
        return False
|
||||
|
||||
# Script body: for each metric, pull the START_TS..END_TS range from Mimir in
# a single query_range call and push every returned series to M3DB in one
# remote-write request.
print(f"Filling gap: {START_TS} to {END_TS}")
total = 0

for metric in METRICS:
    print(f"{metric}...", end=" ", flush=True)
    path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={START_TS}&end={END_TS}&step={STEP}"
    data = mimir_req(path)
    if data["status"] != "success":
        print("failed")
        continue

    series = data["data"]["result"]
    samples = sum(len(s["values"]) for s in series)
    if not samples > 0:
        print("no data")
        continue

    # One length-delimited field-1 entry per series.
    frames = []
    for s in series:
        labels = dict(s["metric"])
        labels["cluster"] = "serverless-inference-cluster"
        # Mimir returns [unix_seconds, "value"]; convert to (ms, float).
        pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
        ts = build_ts(labels, pts)
        frames.append(enc((1<<3)|2) + enc(len(ts)) + ts)

    if write_m3db(b"".join(frames)):
        print(f"{samples} samples written")
        total += samples

print(f"Done! Total: {total} samples")
|
||||
@@ -1,18 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: backfill-massive
|
||||
namespace: m3db
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
volumes:
|
||||
- name: script
|
||||
configMap:
|
||||
name: backfill-massive-script
|
||||
containers:
|
||||
- name: backfill
|
||||
image: python:3.11-slim
|
||||
command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-massive.py"]
|
||||
volumeMounts:
|
||||
- name: script
|
||||
mountPath: /scripts
|
||||
@@ -1,153 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Massive backfill: March 12 - April 1, 2026
|
||||
Writes ONLY to 'default' namespace (raw data)
|
||||
Overlapping chunks - no gaps!
|
||||
"""
|
||||
import struct
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import json
|
||||
import ssl
|
||||
import snappy
|
||||
import base64
|
||||
import time
|
||||
|
||||
# Read credentials from environment (see .env)
|
||||
import os
|
||||
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
|
||||
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
|
||||
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
|
||||
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"
|
||||
|
||||
# March 12 to April 1 (full range)
|
||||
START_TS = 1773273600 # March 12 00:00 UTC
|
||||
END_TS = 1775052000 # April 1 14:00 UTC
|
||||
CHUNK_HOURS = 4 # 4-hour chunks
|
||||
OVERLAP_MINUTES = 30 # 30-min overlap between chunks
|
||||
STEP = "10s"
|
||||
|
||||
METRICS = [
|
||||
"vllm:prompt_tokens_total",
|
||||
"vllm:generation_tokens_total",
|
||||
"DCGM_FI_DEV_GPU_UTIL",
|
||||
]
|
||||
|
||||
def enc(v):
    """Encode the integer `v` as a protobuf base-128 varint and return the bytes.

    Each output byte carries 7 payload bits, least-significant group first;
    the high bit is set on every byte except the last.

    Negative values are encoded as their 64-bit two's-complement form (the
    protobuf int64 convention, always 10 bytes). Previously a negative input
    never terminated, because an arithmetic right shift of a negative Python
    int never reaches 0.
    """
    if v < 0:
        v &= 0xFFFFFFFFFFFFFFFF
    out = bytearray()
    while v > 0x7F:
        out.append(0x80 | (v & 0x7F))  # continuation bit + low 7 bits
        v >>= 7
    out.append(v)  # final group, continuation bit clear
    return bytes(out)
|
||||
|
||||
def es(f, d):
    """Encode field number `f` as a length-delimited field (wire type 2).

    `d` must already be bytes; the result is tag + varint length + payload.
    """
    tag = enc((f << 3) | 2)
    return tag + enc(len(d)) + d


def ed(f, v):
    """Encode field number `f` as a 64-bit field (wire type 1).

    `v` is packed as a little-endian IEEE-754 double after the tag.
    """
    tag = enc((f << 3) | 1)
    return tag + struct.pack("<d", v)
|
||||
|
||||
def build_ts(labels, samples):
    """Serialize one remote-write TimeSeries message body.

    Args:
        labels: dict mapping label name (str) to label value (str).
        samples: iterable of (t_ms, val) pairs; t_ms is an integer timestamp
            written as a varint, val is a float written as a double.

    Returns:
        bytes: the labels as repeated field 1 (name = field 1, value =
        field 2) followed by the samples as repeated field 2 (value =
        field 1, timestamp = field 2).

    Labels are emitted sorted by name. The previous version used dict
    insertion order, and the callers append "cluster" last, which produced
    unsorted label sets; the Prometheus remote-write spec requires labels
    to be sorted lexicographically by name.
    """
    ts = b""
    for name, value in sorted(labels.items()):
        label = es(1, name.encode()) + es(2, value.encode())
        ts += enc((1 << 3) | 2) + enc(len(label)) + label
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        ts += enc((2 << 3) | 2) + enc(len(sample)) + sample
    return ts
|
||||
|
||||
def ssl_ctx():
    """Return a default SSL context with hostname checks and cert verification off.

    NOTE(review): mimir_req() uses this context while sending HTTP Basic
    credentials, so the server's identity is never verified — confirm this
    is intentional.
    """
    insecure = ssl.create_default_context()
    # check_hostname has to be cleared first; CERT_NONE is rejected while it is on.
    insecure.check_hostname = False
    insecure.verify_mode = ssl.CERT_NONE
    return insecure
|
||||
|
||||
def mimir_req(path):
    """GET `MIMIR_URL + path` with HTTP Basic auth and return the parsed JSON body.

    Args:
        path: URL path plus query string, appended verbatim to MIMIR_URL.

    Returns:
        The decoded JSON response.

    Raises:
        Whatever urllib raises on HTTP/connection errors (not caught here),
        and json.JSONDecodeError on a non-JSON body.
    """
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    url = f"{MIMIR_URL}{path}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Basic {auth}")
    # Close the response explicitly: this is called once per chunk per metric,
    # and the response was previously left for the garbage collector.
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
|
||||
|
||||
def write_m3db(data):
    """POST a serialized remote-write payload to the M3 coordinator.

    Args:
        data: uncompressed protobuf bytes; snappy-compressed before sending.

    Returns:
        True if the request completed without an HTTP error, False if the
        coordinator answered with an HTTP error status (the status and the
        first 200 characters of the body are printed). Other exceptions
        (e.g. connection failures) still propagate.
    """
    payload = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", payload, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    # TARGET ONLY DEFAULT NAMESPACE
    # NOTE(review): presumably the coordinator honours this header — confirm.
    req.add_header("X-M3-Namespace", "default")
    try:
        # Context manager closes the response; it used to be left open.
        with urllib.request.urlopen(req, timeout=300):
            return True
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error body must not mask the HTTP error.
        err = e.read().decode(errors="replace")[:200]
        print(f" WRITE ERROR {e.code}: {err}")
        return False
|
||||
|
||||
def _chunk_windows(start, end, size, overlap):
    """Yield (chunk_start, chunk_end) query windows covering start..end.

    Consecutive windows overlap by `overlap` seconds. Iteration stops as soon
    as a window reaches `end`. The previous inline loop rewound chunk_start
    to `chunk_end - overlap` after every chunk, including the last one, so
    `chunk_start < END_TS` stayed true forever and the final window was
    re-queried and re-written endlessly.
    """
    if overlap >= size:
        # Each step advances by size - overlap; anything else cannot make progress.
        raise ValueError("overlap must be smaller than the chunk size")
    lo = start
    while lo < end:
        hi = min(lo + size, end)
        yield lo, hi
        if hi >= end:
            return
        lo = hi - overlap


# Script body: for each metric, walk START_TS..END_TS in overlapping chunks,
# pulling each chunk from Mimir and writing it to M3DB.
print(f"MASSIVE BACKFILL - DEFAULT NAMESPACE ONLY")
print(f"Range: March 12 - April 1, 2026")
print(f"Chunk size: {CHUNK_HOURS}h, Overlap: {OVERLAP_MINUTES}m")
print(f"Metrics: {len(METRICS)}")
print("="*60)

total_samples = 0
start_time = time.time()
chunk_seconds = CHUNK_HOURS * 3600
overlap_seconds = OVERLAP_MINUTES * 60

for metric in METRICS:
    print(f"\n{metric}:")
    metric_samples = 0

    windows = _chunk_windows(START_TS, END_TS, chunk_seconds, overlap_seconds)
    # chunk_num counts every window, including ones that fail or are empty.
    for chunk_num, (chunk_start, chunk_end) in enumerate(windows, 1):
        path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={chunk_start}&end={chunk_end}&step={STEP}"

        try:
            data = mimir_req(path)
        except Exception as e:
            # Best effort: report the failed chunk and move on to the next window.
            print(f" Chunk {chunk_num}: QUERY FAILED: {e}")
            continue

        if data.get("status") != "success":
            print(f" Chunk {chunk_num}: status={data.get('status')}")
            continue

        series = data["data"]["result"]
        samples = sum(len(s["values"]) for s in series)

        if samples == 0:
            continue

        wr = b""
        for s in series:
            labels = dict(s["metric"])
            labels["cluster"] = "serverless-inference-cluster"
            # Mimir returns [unix_seconds, "value"]; convert to (ms, float).
            pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
            ts = build_ts(labels, pts)
            wr += enc((1<<3)|2) + enc(len(ts)) + ts

        if write_m3db(wr):
            # Samples inside the overlap are counted once per chunk they appear in.
            metric_samples += samples
            hrs = (chunk_end - chunk_start) / 3600
            print(f" Chunk {chunk_num}: {samples:,} samples ({hrs:.1f}h) ✓", flush=True)

    total_samples += metric_samples
    print(f" TOTAL {metric}: {metric_samples:,} samples")

elapsed = time.time() - start_time
print("="*60)
print(f"DONE! {total_samples:,} samples in {elapsed:.1f}s")
|
||||
@@ -1,18 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: backfill-v2
|
||||
namespace: m3db
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
volumes:
|
||||
- name: script
|
||||
configMap:
|
||||
name: backfill-script-v2
|
||||
containers:
|
||||
- name: backfill
|
||||
image: python:3.11-slim
|
||||
command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill.py"]
|
||||
volumeMounts:
|
||||
- name: script
|
||||
mountPath: /scripts
|
||||
@@ -1,124 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""M3DB Backfill - Pull vLLM/DCGM metrics from Mimir and write to M3DB"""
|
||||
import struct
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import json
|
||||
import ssl
|
||||
import snappy
|
||||
import base64
|
||||
import sys
|
||||
|
||||
print("Starting backfill script...", flush=True)
|
||||
|
||||
# Read credentials from environment (see .env)
|
||||
import os
|
||||
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
|
||||
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
|
||||
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
|
||||
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"
|
||||
|
||||
START_TS = 1773187200 # 2026-03-11T00:00:00Z
|
||||
END_TS = 1775040000 # 2026-04-01T10:40:00Z (just before node restart) -- NOTE(review): this comment previously said 11:40:00Z, but the value decodes to 10:40:00Z; confirm which was intended
|
||||
STEP = "10s"
|
||||
CHUNK_HOURS = 6
|
||||
|
||||
METRICS = [
|
||||
"vllm:prompt_tokens_total",
|
||||
"vllm:generation_tokens_total",
|
||||
"DCGM_FI_DEV_GPU_UTIL",
|
||||
]
|
||||
|
||||
def enc(v):
    """Encode an integer as a protobuf base-128 varint.

    Args:
        v: Integer to encode. Negative values are encoded as their 64-bit
            two's-complement bit pattern (ten bytes), which is how protobuf
            serializes negative int64 fields.

    Returns:
        The varint as ``bytes``: little-endian groups of 7 bits, with the high
        bit set on every byte except the last.
    """
    # A negative int never becomes 0 under Python's arithmetic right shift, so
    # shifting it in a loop would never terminate. Mask to 64 bits first.
    if v < 0:
        v &= 0xFFFFFFFFFFFFFFFF
    out = bytearray()
    while v > 0x7F:
        out.append(0x80 | (v & 0x7F))
        v >>= 7
    out.append(v)
    return bytes(out)
|
||||
|
||||
def es(f, d):
    """Encode a length-delimited protobuf field (wire type 2).

    Args:
        f: Field number.
        d: Payload bytes (a string's encoded bytes or a nested message).

    Returns:
        Tag varint, followed by the payload length varint, followed by ``d``.
    """
    tag = enc((f << 3) | 2)
    length = enc(len(d))
    return tag + length + d
|
||||
def ed(f, v):
    """Encode a 64-bit double protobuf field (wire type 1).

    Args:
        f: Field number.
        v: Float value, packed as a little-endian IEEE-754 double.

    Returns:
        Tag varint followed by the 8 packed bytes.
    """
    tag = enc((f << 3) | 1)
    payload = struct.pack("<d", v)
    return tag + payload
|
||||
|
||||
def build_ts(labels, samples):
    """Serialize the body of one Prometheus remote-write TimeSeries message.

    Args:
        labels: Mapping of label name -> label value (both ``str``).
        samples: Iterable of ``(timestamp_ms, value)`` pairs.

    Returns:
        The encoded TimeSeries bytes: every label as field 1 (a nested message
        with name = field 1 and value = field 2), then every sample as field 2
        (a nested message with the double value = field 1 and the varint
        timestamp = field 2). The caller wraps the result in the outer
        WriteRequest field.
    """
    parts = []
    # The remote-write protocol requires labels sorted lexicographically by
    # name; dict order from the query response is not guaranteed to be sorted.
    for name, value in sorted(labels.items()):
        label_msg = es(1, name.encode()) + es(2, value.encode())
        parts.append(es(1, label_msg))
    for t_ms, val in samples:
        sample_msg = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        parts.append(es(2, sample_msg))
    # Join once instead of growing a bytes object per sample.
    return b"".join(parts)
|
||||
|
||||
def ssl_ctx(verify=False):
    """Build the SSL context used for HTTPS requests.

    Args:
        verify: When False (the default, matching the historical behavior)
            hostname checking and certificate verification are both disabled.
            When True the default, fully verifying context is returned.

    Returns:
        An ``ssl.SSLContext``.

    WARNING: with ``verify=False`` the connection is open to
    man-in-the-middle interception. Pass ``verify=True`` whenever the server
    presents a certificate that the system trust store can validate.
    """
    ctx = ssl.create_default_context()
    if not verify:
        # check_hostname must be cleared before verify_mode can be CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
    return ctx
|
||||
|
||||
def mimir_req(path):
    """GET ``MIMIR_URL + path`` with HTTP Basic auth and return the parsed JSON.

    Args:
        path: URL path plus query string, appended verbatim to ``MIMIR_URL``.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError / HTTPError on transport or HTTP failures, and
        ValueError (json.JSONDecodeError) when the body is not valid JSON.
    """
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    # Use the response as a context manager so the connection is always
    # released, even if reading or parsing the body raises.
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
|
||||
|
||||
def write_m3db(data):
    """POST a serialized WriteRequest to the remote-write endpoint under M3DB_URL.

    Args:
        data: Uncompressed protobuf WriteRequest bytes; they are
            snappy-compressed here before sending.

    Returns:
        True when the request completes without an HTTP error, False when the
        server answers with an HTTP error status (the status and the first
        100 characters of the body are printed). Other exceptions, such as
        connection failures, propagate to the caller.
    """
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    try:
        # Context manager closes the response; the body itself is not needed.
        with urllib.request.urlopen(req, timeout=300):
            return True
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error body must not raise from inside
        # the error handler and mask the HTTP failure being reported.
        body = e.read().decode(errors="replace")
        print(f" ERROR {e.code}: {body[:100]}", flush=True)
        return False
|
||||
|
||||
# Driver: for every metric, walk [START_TS, END_TS) in CHUNK_HOURS windows,
# query each window from Mimir and forward the samples to M3DB.
print(f"Time range: {START_TS} to {END_TS}", flush=True)
total = 0

for metric in METRICS:
    print(f"\n{metric}...", flush=True)
    metric_total = 0
    chunk_start = START_TS
    chunks_done = 0

    while chunk_start < END_TS:
        chunk_end = min(chunk_start + CHUNK_HOURS * 3600, END_TS)
        try:
            path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={chunk_start}&end={chunk_end}&step={STEP}"
            data = mimir_req(path)
            status = data.get("status")
            if status != "success":
                # Log the skipped window so gaps in the backfill are visible
                # in the output instead of being dropped silently.
                print(f" Chunk {chunk_start}-{chunk_end}: status={status}", flush=True)
            else:
                series = data["data"]["result"]
                samples = sum(len(s["values"]) for s in series)
                if samples > 0:
                    wr = b""
                    for s in series:
                        labels = dict(s["metric"])
                        # Every forwarded series is tagged with this cluster label.
                        labels["cluster"] = "serverless-inference-cluster"
                        # Query timestamps are in seconds; remote-write uses ms.
                        pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
                        ts = build_ts(labels, pts)
                        # Field 1 of WriteRequest: one length-delimited TimeSeries.
                        wr += enc((1<<3)|2) + enc(len(ts)) + ts
                    if write_m3db(wr):
                        metric_total += samples
                    chunks_done += 1
                    if chunks_done % 10 == 0:
                        print(f" {chunks_done} chunks, {metric_total} samples...", flush=True)
        except Exception as e:
            # Best effort: report the failure and move on to the next window.
            print(f" Chunk error: {e}", flush=True)

        chunk_start = chunk_end

    print(f" Done: {metric_total} samples", flush=True)
    total += metric_total

print(f"\nBackfill complete! Total: {total} samples", flush=True)
|
||||
@@ -1,245 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for M3DB read/write functionality.
|
||||
Usage: python3 test-metrics.py <BASE_URL> [USERNAME] [PASSWORD]
|
||||
|
||||
Examples:
|
||||
python3 test-metrics.py https://m3db.vultrlabs.dev example example
|
||||
python3 test-metrics.py http://192.168.1.100:7201
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
import random
|
||||
import requests
|
||||
|
||||
|
||||
def main():
    """Smoke-test an M3 coordinator: health, placement, namespaces, query, write, read-back.

    Command line: ``<BASE_URL> [USERNAME] [PASSWORD]``. Basic auth is used only
    when both username and password are given.

    Exits with status 1 on a usage error or when the health check fails; all
    later steps report their failures and continue. The write test is skipped
    (with a message) when ``python-snappy`` is not installed.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 test-metrics.py <BASE_URL> [USERNAME] [PASSWORD]")
        print("Example: python3 test-metrics.py https://m3db.vultrlabs.dev example example")
        print(" python3 test-metrics.py http://192.168.1.100:7201")
        sys.exit(1)

    base_url = sys.argv[1].rstrip('/')
    username = sys.argv[2] if len(sys.argv) > 2 else None
    password = sys.argv[3] if len(sys.argv) > 3 else None

    # Setup auth if provided
    auth = (username, password) if username and password else None

    print("=== M3DB Metrics Test ===")
    print(f"URL: {base_url}")
    if auth:
        print(f"Auth: {username}:***")
    print()

    # Check coordinator health
    print("=== Health Check ===")
    health_url = f"{base_url}/health"
    try:
        resp = requests.get(health_url, auth=auth, timeout=10)
        if resp.status_code == 200:
            print("✓ Coordinator healthy")
        elif resp.status_code == 401:
            print("✗ Authentication required. Provide username and password.")
            sys.exit(1)
        else:
            print(f"✗ Coordinator unhealthy: {resp.status_code}")
            sys.exit(1)
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to connect: {e}")
        sys.exit(1)

    # Check placement
    print()
    print("=== Placement ===")
    placement_url = f"{base_url}/api/v1/services/m3db/placement"
    try:
        resp = requests.get(placement_url, auth=auth, timeout=10)
        if resp.status_code == 200:
            placement = resp.json()
            instances = placement.get("placement", {}).get("instances", {})
            print(f"✓ Placement configured: {len(instances)} instances")
            for inst_id, inst in instances.items():
                print(f" - {inst_id}: {inst.get('endpoint', 'unknown')}")
        else:
            print(f"✗ Placement not ready: {resp.status_code}")
            print(f" Response: {resp.text}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to get placement: {e}")

    # Check namespaces
    print()
    print("=== Namespaces ===")
    namespace_url = f"{base_url}/api/v1/services/m3db/namespace"
    try:
        resp = requests.get(namespace_url, auth=auth, timeout=10)
        if resp.status_code == 200:
            ns_data = resp.json()
            # The namespace map is nested under "registry"; reading only the
            # top-level "namespaces" key always reported zero namespaces. The
            # top-level key is kept as a fallback.
            namespaces = (
                (ns_data.get("registry") or {}).get("namespaces")
                or ns_data.get("namespaces")
                or {}
            )
            print(f"✓ Namespaces configured: {len(namespaces)}")
            for ns_name in namespaces.keys():
                print(f" - {ns_name}")
        else:
            print(f"✗ Namespaces not ready: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to get namespaces: {e}")

    # Query test
    print()
    print("=== Query Test ===")
    query_url = f"{base_url}/api/v1/query"
    try:
        resp = requests.get(query_url, params={"query": "up"}, auth=auth, timeout=10)
        if resp.status_code == 200:
            result = resp.json()
            status = result.get("status")
            print(f"✓ Query returned: {status}")
            data = result.get("data", {}).get("result", [])
            print(f" Results: {len(data)} series")
        else:
            print(f"✗ Query failed: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Query failed: {e}")

    # Write test using Prometheus remote_write
    print()
    print("=== Write Test ===")
    print("Writing metrics via Prometheus remote_write format...")

    try:
        import struct
        import snappy  # pip install python-snappy
    except ImportError:
        print("✗ Missing dependencies for write test")
        print(" Install with: pip install python-snappy")
        print(" Skipping write test...")
        print()
        print("=== Test complete (read-only) ===")
        return

    write_url = f"{base_url}/api/v1/prom/remote/write"

    def encode_varint(n):
        """Encode a varint (negative values as 64-bit two's complement)."""
        if n < 0:
            # bytes([negative]) would raise; protobuf encodes negative int64
            # values as their 64-bit two's-complement pattern.
            n &= 0xFFFFFFFFFFFFFFFF
        result = []
        while n > 127:
            result.append((n & 0x7F) | 0x80)
            n >>= 7
        result.append(n)
        return bytes(result)

    def encode_string(field_num, s):
        """Encode a string field in protobuf"""
        data = s.encode('utf-8')
        tag = (field_num << 3) | 2
        return bytes([tag]) + encode_varint(len(data)) + data

    def encode_double(field_num, value):
        """Encode a double field in protobuf"""
        tag = (field_num << 3) | 1
        return bytes([tag]) + struct.pack('<d', value)

    def encode_int64(field_num, value):
        """Encode an int64 field in protobuf (as varint)"""
        tag = (field_num << 3) | 0
        return bytes([tag]) + encode_varint(value)

    def encode_label(name, value):
        """Encode a single Label message"""
        return encode_string(1, name) + encode_string(2, value)

    def write_metric(name, value, labels_dict):
        """Write one sample for `name` with custom labels; return the HTTP status code."""
        ts_ms = int(time.time() * 1000)

        # Build all labels as repeated Label messages (field 1, tag 0x0a).
        # The remote-write protocol requires labels sorted by name, so merge
        # __name__ with the custom labels and sort the combined set.
        all_labels = dict(labels_dict)
        all_labels["__name__"] = name
        labels_data = b''
        for k, v in sorted(all_labels.items()):
            label_msg = encode_label(k, v)
            labels_data += bytes([0x0a]) + encode_varint(len(label_msg)) + label_msg

        # Build Sample (field 2 in TimeSeries)
        sample = encode_double(1, float(value)) + encode_int64(2, ts_ms)

        # Build TimeSeries
        ts_encoded = labels_data + bytes([0x12]) + encode_varint(len(sample)) + sample

        # Build WriteRequest
        write_req = bytes([0x0a]) + encode_varint(len(ts_encoded)) + ts_encoded

        # Compress with snappy
        compressed = snappy.compress(write_req)

        headers = {
            "Content-Encoding": "snappy",
            "Content-Type": "application/x-protobuf",
            "X-Prometheus-Remote-Write-Version": "0.1.0"
        }

        resp = requests.post(write_url, data=compressed, headers=headers, auth=auth, timeout=10)
        return resp.status_code

    # Write test metrics with tenant labels
    print()
    tenants = [
        {"tenant": "test-tenant", "service": "api", "env": "test"},
    ]

    ts = int(time.time())
    # The name depends only on the start timestamp, so compute it once; it is
    # also needed by the read-back step after the loop.
    metric_name = f"test_metric_{ts}"
    for labels in tenants:
        metric_value = random.randint(1, 100)

        try:
            status = write_metric(metric_name, metric_value, labels)
        except requests.exceptions.RequestException as e:
            print(f"✗ Write failed: {e}")
            continue

        # Only claim success when the coordinator actually accepted the write.
        if 200 <= status < 300:
            print(f"✓ Wrote: {metric_name} = {metric_value}")
            print(f" Labels: tenant={labels.get('tenant')}, service={labels.get('service')}, env={labels.get('env')}")
        else:
            print(f"✗ Write failed: HTTP {status}")

    # Wait and query back
    time.sleep(2)

    print()
    print("=== Read Back Test ===")
    try:
        resp = requests.get(query_url, params={"query": metric_name}, auth=auth, timeout=10)
        if resp.status_code == 200:
            result = resp.json()
            data = result.get("data", {}).get("result", [])
            if data:
                print("✓ Metric found!")
                for series in data:
                    metric = series.get("metric", {})
                    values = series.get("values", series.get("value", []))
                    print(f" Labels: {metric}")
                    print(f" Values: {values}")
            else:
                print("✗ Metric not found (may take a moment to index)")
        else:
            print(f"✗ Query failed: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Query failed: {e}")

    print()
    print("=== Multi-Tenancy Query Examples ===")
    print()
    print("Query by tenant:")
    print(f" curl -u user:pass '{base_url}/api/v1/query?query={{tenant=\"test-tenant\"}}'")
    print()
    print("Query by service:")
    print(f" curl -u user:pass '{base_url}/api/v1/query?query={{service=\"api\"}}'")
    print()
    print("Query by env:")
    print(f" curl -u user:pass '{base_url}/api/v1/query?query={{env=\"test\"}}'")
    print()

    print("=== Test complete ===")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,93 +0,0 @@
|
||||
#!/bin/bash
#
# Simple M3DB connectivity test
# Usage: ./test-metrics.sh <BASE_URL> [USERNAME] [PASSWORD]
#
# Examples:
#   ./test-metrics.sh https://m3db.vultrlabs.dev example example
#   ./test-metrics.sh http://192.168.1.100:7201
#
# Requires: curl, python3 (used only to parse the JSON responses).
# NOTE: credentials passed as arguments are visible to other local users via
# the process list; avoid real secrets on shared hosts.
#

set -euo pipefail

BASE_URL="${1:-}"
USERNAME="${2:-}"
PASSWORD="${3:-}"

if [ -z "$BASE_URL" ]; then
  echo "Usage: $0 <BASE_URL> [USERNAME] [PASSWORD]" >&2
  echo "Example: $0 https://m3db.vultrlabs.dev example example" >&2
  echo " $0 http://192.168.1.100:7201" >&2
  exit 1
fi

# Remove trailing slash if present
BASE_URL="${BASE_URL%/}"

# Build auth arguments if credentials provided. An array (rather than a flat
# string expanded unquoted) keeps usernames/passwords containing spaces or
# glob characters intact as a single curl argument.
AUTH_ARGS=()
if [ -n "$USERNAME" ] && [ -n "$PASSWORD" ]; then
  AUTH_ARGS=(-u "${USERNAME}:${PASSWORD}")
fi

# curl wrapper: silent, fail on HTTP errors, with optional basic auth.
# The ${arr[@]+...} form keeps an empty array safe under `set -u` on old bash.
m3_get() {
  curl -sf ${AUTH_ARGS[@]+"${AUTH_ARGS[@]}"} "$@"
}

echo "=== M3DB Connectivity Test ==="
echo "Target: ${BASE_URL}"
if [ "${#AUTH_ARGS[@]}" -gt 0 ]; then
  echo "Auth: ${USERNAME}:***"
fi
echo ""

# Health check
echo "1. Coordinator Health"
if m3_get "${BASE_URL}/health" > /dev/null 2>&1; then
  echo " ✓ Healthy"
else
  echo " ✗ Unhealthy or unreachable"
  exit 1
fi

# Placement
echo ""
echo "2. Placement (cluster topology)"
# A failed request falls back to '{}' so the step reports "not configured"
# instead of aborting the whole script under `set -e`.
PLACEMENT=$(m3_get "${BASE_URL}/api/v1/services/m3db/placement" 2>/dev/null || echo '{}')
INSTANCE_COUNT=$(echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); print(len(d))" 2>/dev/null || echo "0")
if [ "$INSTANCE_COUNT" -gt 0 ]; then
  echo " ✓ $INSTANCE_COUNT instances in placement"
  echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); [print(f' - {k}') for k in d.keys()]" 2>/dev/null || true
else
  echo " ✗ No placement configured (run init job)"
fi

# Namespaces
echo ""
echo "3. Namespaces (retention policies)"
NAMESPACES=$(m3_get "${BASE_URL}/api/v1/services/m3db/namespace" 2>/dev/null || echo '{}')
NS_COUNT=$(echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); print(len(d))" 2>/dev/null || echo "0")
if [ "$NS_COUNT" -gt 0 ]; then
  echo " ✓ $NS_COUNT namespaces configured"
  echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); [print(f' - {k}') for k in d.keys()]" 2>/dev/null || true
else
  echo " ✗ No namespaces configured (run init job)"
fi

# Query test
echo ""
echo "4. Query Test (PromQL)"
QUERY_RESULT=$(m3_get "${BASE_URL}/api/v1/query?query=up" 2>/dev/null || echo '{"status":"error"}')
STATUS=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")
if [ "$STATUS" = "success" ]; then
  RESULT_COUNT=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('data',{}).get('result',[])))" 2>/dev/null || echo "0")
  echo " ✓ Query returned: $RESULT_COUNT series"
else
  echo " ✗ Query failed"
fi

# Write test (requires protobuf + snappy, so just note it)
echo ""
echo "5. Write Test"
echo " Note: Prometheus remote_write requires protobuf + snappy encoding."
echo " Use test-metrics.py for full write/read verification."
echo " Install: pip install python-snappy requests"

echo ""
echo "=== Test Complete ==="
|
||||
@@ -7,5 +7,5 @@ resources:
|
||||
- 02-etcd.yaml
|
||||
- 03-configmaps.yaml
|
||||
- 04-m3dbnode.yaml
|
||||
- 05-m3coordinator.yaml
|
||||
- 05-m3coordinator-deployment.yaml
|
||||
- 06-init-and-pdb.yaml
|
||||
|
||||
Reference in New Issue
Block a user