tweaks with backfill and grafana
This commit is contained in:
18
.env.example
Normal file
18
.env.example
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# M3DB Cluster Credentials
|
||||||
|
# ========================
|
||||||
|
# Copy this file to .env and fill in your values
|
||||||
|
|
||||||
|
# M3DB Basic Auth (coordinator API access)
|
||||||
|
M3DB_USERNAME=<your-m3db-username>
|
||||||
|
M3DB_PASSWORD=<your-m3db-password>
|
||||||
|
|
||||||
|
# Mimir (source for backfill)
|
||||||
|
MIMIR_USERNAME=<your-mimir-username>
|
||||||
|
MIMIR_PASSWORD=<your-mimir-password>
|
||||||
|
|
||||||
|
# Grafana Admin
|
||||||
|
GRAFANA_ADMIN_PASSWORD=<your-grafana-admin-password>
|
||||||
|
|
||||||
|
# M3DB Basic Auth (htpasswd base64)
|
||||||
|
# Generate with: echo -n "username:password" | base64
|
||||||
|
M3DB_HTPASSWD_B64=<base64-encoded-htpasswd>
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1 +1,2 @@
|
|||||||
kubeconfig.yaml
|
kubeconfig.yaml
|
||||||
|
.env
|
||||||
|
|||||||
@@ -111,11 +111,11 @@ spec:
|
|||||||
mountPath: /var/lib/m3kv
|
mountPath: /var/lib/m3kv
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: "1"
|
cpu: "4"
|
||||||
memory: 4Gi
|
|
||||||
limits:
|
|
||||||
cpu: "2"
|
|
||||||
memory: 8Gi
|
memory: 8Gi
|
||||||
|
limits:
|
||||||
|
cpu: "8"
|
||||||
|
memory: 20Gi
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /health
|
path: /health
|
||||||
|
|||||||
@@ -1,117 +1,70 @@
|
|||||||
##############################################################################
|
|
||||||
# M3 Coordinator — Deployment
|
|
||||||
# Stateless query/write layer — Prometheus remote_write & remote_read target
|
|
||||||
# This is what Grafana and Prometheus talk to (replaces Mimir endpoints)
|
|
||||||
##############################################################################
|
|
||||||
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: m3coordinator
|
|
||||||
namespace: m3db
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: m3coordinator
|
|
||||||
app.kubernetes.io/part-of: m3db
|
|
||||||
spec:
|
|
||||||
replicas: 2
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: m3coordinator
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: m3coordinator
|
|
||||||
app.kubernetes.io/part-of: m3db
|
|
||||||
annotations:
|
|
||||||
prometheus.io/scrape: "true"
|
|
||||||
prometheus.io/port: "7203"
|
|
||||||
spec:
|
|
||||||
affinity:
|
|
||||||
podAntiAffinity:
|
|
||||||
preferredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
- weight: 100
|
|
||||||
podAffinityTerm:
|
|
||||||
labelSelector:
|
|
||||||
matchExpressions:
|
|
||||||
- key: app.kubernetes.io/name
|
|
||||||
operator: In
|
|
||||||
values:
|
|
||||||
- m3coordinator
|
|
||||||
topologyKey: kubernetes.io/hostname
|
|
||||||
containers:
|
|
||||||
- name: m3coordinator
|
|
||||||
image: quay.io/m3db/m3coordinator:v1.5.0
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
args:
|
|
||||||
- "-f"
|
|
||||||
- "/etc/m3coordinator/m3coordinator.yml"
|
|
||||||
ports:
|
|
||||||
- containerPort: 7201
|
|
||||||
name: api
|
|
||||||
protocol: TCP
|
|
||||||
- containerPort: 7203
|
|
||||||
name: metrics
|
|
||||||
protocol: TCP
|
|
||||||
volumeMounts:
|
|
||||||
- name: config
|
|
||||||
mountPath: /etc/m3coordinator
|
|
||||||
- name: cache-dir
|
|
||||||
mountPath: /var/lib/m3kv
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
limits:
|
|
||||||
cpu: "1"
|
|
||||||
memory: 2Gi
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /health
|
|
||||||
port: 7201
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 10
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /health
|
|
||||||
port: 7201
|
|
||||||
initialDelaySeconds: 10
|
|
||||||
periodSeconds: 5
|
|
||||||
volumes:
|
|
||||||
- name: config
|
|
||||||
configMap:
|
|
||||||
name: m3coordinator-config
|
|
||||||
- name: cache-dir
|
|
||||||
emptyDir: {}
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
# M3 Coordinator Service
|
|
||||||
# Endpoints for Prometheus remote_write / remote_read / Grafana
|
|
||||||
#
|
|
||||||
# remote_write → http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/write
|
|
||||||
# remote_read → http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/read
|
|
||||||
# query (Grafana Prometheus datasource) → http://m3coordinator.m3db.svc.cluster.local:7201
|
|
||||||
##############################################################################
|
|
||||||
|
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: ConfigMap
|
||||||
metadata:
|
metadata:
|
||||||
name: m3coordinator
|
name: m3coordinator-config
|
||||||
namespace: m3db
|
namespace: m3db
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: m3coordinator
|
app.kubernetes.io/name: m3coordinator
|
||||||
app.kubernetes.io/part-of: m3db
|
data:
|
||||||
spec:
|
m3coordinator.yml: |
|
||||||
type: ClusterIP
|
listenAddress: 0.0.0.0:7201
|
||||||
ports:
|
|
||||||
- name: api
|
logging:
|
||||||
port: 7201
|
level: info
|
||||||
targetPort: 7201
|
|
||||||
protocol: TCP
|
metrics:
|
||||||
- name: metrics
|
scope:
|
||||||
port: 7203
|
prefix: coordinator
|
||||||
targetPort: 7203
|
prometheus:
|
||||||
protocol: TCP
|
handlerPath: /metrics
|
||||||
selector:
|
listenAddress: 0.0.0.0:7203
|
||||||
app.kubernetes.io/name: m3coordinator
|
sanitization: prometheus
|
||||||
|
samplingRate: 1.0
|
||||||
|
|
||||||
|
tagOptions:
|
||||||
|
idScheme: quoted
|
||||||
|
|
||||||
|
clusters:
|
||||||
|
- namespaces:
|
||||||
|
- namespace: default
|
||||||
|
type: unaggregated
|
||||||
|
retention: 720h
|
||||||
|
- namespace: agg_1m_60d
|
||||||
|
type: aggregated
|
||||||
|
retention: 1440h
|
||||||
|
resolution: 1m
|
||||||
|
- namespace: agg_1h_1y
|
||||||
|
type: aggregated
|
||||||
|
retention: 8760h
|
||||||
|
resolution: 1h
|
||||||
|
client:
|
||||||
|
config:
|
||||||
|
service:
|
||||||
|
env: default_env
|
||||||
|
zone: embedded
|
||||||
|
service: m3db
|
||||||
|
cacheDir: /var/lib/m3kv
|
||||||
|
etcdClusters:
|
||||||
|
- zone: embedded
|
||||||
|
endpoints:
|
||||||
|
- http://etcd-0.etcd.m3db.svc.cluster.local:2379
|
||||||
|
- http://etcd-1.etcd.m3db.svc.cluster.local:2379
|
||||||
|
- http://etcd-2.etcd.m3db.svc.cluster.local:2379
|
||||||
|
writeConsistencyLevel: majority
|
||||||
|
readConsistencyLevel: unstrict_majority
|
||||||
|
|
||||||
|
downsample:
|
||||||
|
rules:
|
||||||
|
mappingRules:
|
||||||
|
- name: "1min for 60 days"
|
||||||
|
filter: "__name__:*"
|
||||||
|
aggregations: ["Last"]
|
||||||
|
storagePolicies:
|
||||||
|
- resolution: 1m
|
||||||
|
retention: 1440h
|
||||||
|
- name: "1hour for 1 year"
|
||||||
|
filter: "__name__:*"
|
||||||
|
aggregations: ["Last"]
|
||||||
|
storagePolicies:
|
||||||
|
- resolution: 1h
|
||||||
|
retention: 8760h
|
||||||
|
|||||||
@@ -16,10 +16,11 @@ metadata:
|
|||||||
name: basic-auth-secret
|
name: basic-auth-secret
|
||||||
namespace: m3db
|
namespace: m3db
|
||||||
type: Opaque
|
type: Opaque
|
||||||
# htpasswd -nb example example
|
# Generate with: htpasswd -nb vultr_m3db <password> | base64
|
||||||
|
# See .env for credentials
|
||||||
stringData:
|
stringData:
|
||||||
users: |-
|
users: |-
|
||||||
example:$apr1$oMBgtfpd$CBTS17sDq7GN58qaoIMvh.
|
vultr_m3db:$apr1$xyz$tempplaceholderREPLACEFROMENV
|
||||||
|
|
||||||
---
|
---
|
||||||
apiVersion: traefik.io/v1alpha1
|
apiVersion: traefik.io/v1alpha1
|
||||||
|
|||||||
163
10-grafana.yaml
Normal file
163
10-grafana.yaml
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
##############################################################################
|
||||||
|
# Grafana - Visualization for M3DB metrics
|
||||||
|
# Deployed on dedicated grafana nodepool
|
||||||
|
# Exposed via LoadBalancer (no TLS - Grafana has built-in auth)
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: grafana-admin
|
||||||
|
namespace: grafana
|
||||||
|
type: Opaque
|
||||||
|
stringData:
|
||||||
|
admin-user: admin
|
||||||
|
# REPLACE: Set from .env GRAFANA_ADMIN_PASSWORD
|
||||||
|
admin-password: "REPLACE_WITH_GRAFANA_ADMIN_PASSWORD"
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: grafana-datasources
|
||||||
|
namespace: grafana
|
||||||
|
labels:
|
||||||
|
grafana_datasource: "1"
|
||||||
|
data:
|
||||||
|
datasources.yaml: |
|
||||||
|
apiVersion: 1
|
||||||
|
datasources:
|
||||||
|
- name: M3DB
|
||||||
|
type: prometheus
|
||||||
|
access: proxy
|
||||||
|
url: http://m3coordinator.m3db:7201
|
||||||
|
basicAuth: true
|
||||||
|
# REPLACE: Set from .env M3DB_USERNAME and M3DB_PASSWORD
|
||||||
|
basicAuthUser: REPLACE_WITH_M3DB_USERNAME
|
||||||
|
secureJsonData:
|
||||||
|
basicAuthPassword: 'REPLACE_WITH_M3DB_PASSWORD'
|
||||||
|
isDefault: true
|
||||||
|
editable: false
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: grafana-storage
|
||||||
|
namespace: grafana
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 10Gi
|
||||||
|
storageClassName: vultr-block-storage
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: grafana
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
spec:
|
||||||
|
# Schedule only on grafana nodepool
|
||||||
|
nodeSelector:
|
||||||
|
vke.vultr.com/node-pool: grafana
|
||||||
|
securityContext:
|
||||||
|
fsGroup: 472
|
||||||
|
runAsUser: 472
|
||||||
|
runAsGroup: 472
|
||||||
|
containers:
|
||||||
|
- name: grafana
|
||||||
|
image: grafana/grafana:11.5.2
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
containerPort: 3000
|
||||||
|
protocol: TCP
|
||||||
|
env:
|
||||||
|
- name: GF_SECURITY_ADMIN_USER
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: grafana-admin
|
||||||
|
key: admin-user
|
||||||
|
- name: GF_SECURITY_ADMIN_PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: grafana-admin
|
||||||
|
key: admin-password
|
||||||
|
- name: GF_AUTH_ANONYMOUS_ENABLED
|
||||||
|
value: "false"
|
||||||
|
- name: GF_SERVER_ROOT_URL
|
||||||
|
value: "%(protocol)s://%(domain)s:%(http_port)s/"
|
||||||
|
- name: GF_INSTALL_PLUGINS
|
||||||
|
value: ""
|
||||||
|
volumeMounts:
|
||||||
|
- name: storage
|
||||||
|
mountPath: /var/lib/grafana
|
||||||
|
- name: datasources
|
||||||
|
mountPath: /etc/grafana/provisioning/datasources
|
||||||
|
readOnly: true
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 250m
|
||||||
|
memory: 512Mi
|
||||||
|
limits:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 1Gi
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /api/health
|
||||||
|
port: http
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 10
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /api/health
|
||||||
|
port: http
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
volumes:
|
||||||
|
- name: storage
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: grafana-storage
|
||||||
|
- name: datasources
|
||||||
|
configMap:
|
||||||
|
name: grafana-datasources
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: grafana
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
spec:
|
||||||
|
type: LoadBalancer
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 80
|
||||||
|
targetPort: http
|
||||||
|
protocol: TCP
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
171
backfill/BACKFILL_RUNBOOK.md
Normal file
171
backfill/BACKFILL_RUNBOOK.md
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
# M3DB Backfill Runbook (Revised)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Backfilling ~3 weeks of vLLM + DCGM metrics from Mimir to M3DB.
|
||||||
|
|
||||||
|
**Blocker discovered:** `bufferPast` is immutable on existing namespaces. Downsample pipeline rejects historical writes.
|
||||||
|
|
||||||
|
**Solution:** Create new backfill namespaces with `bufferPast=504h` (21 days).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 1 — Create Backfill Namespaces
|
||||||
|
|
||||||
|
```bash
|
||||||
|
COORD="http://m3coordinator.m3db.svc.cluster.local:7201"
|
||||||
|
|
||||||
|
# default_backfill: 7d retention, 21d bufferPast
|
||||||
|
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"name": "default_backfill",
|
||||||
|
"options": {
|
||||||
|
"retentionOptions": {
|
||||||
|
"retentionPeriodDuration": "168h",
|
||||||
|
"blockSizeDuration": "2h",
|
||||||
|
"bufferFutureDuration": "10m",
|
||||||
|
"bufferPastDuration": "504h"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
|
||||||
|
# agg_10s_backfill: 90d retention, 10s resolution, 21d bufferPast
|
||||||
|
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"name": "agg_10s_backfill",
|
||||||
|
"options": {
|
||||||
|
"retentionOptions": {
|
||||||
|
"retentionPeriodDuration": "2160h",
|
||||||
|
"blockSizeDuration": "24h",
|
||||||
|
"bufferFutureDuration": "10m",
|
||||||
|
"bufferPastDuration": "504h"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"aggregationOptions": {
|
||||||
|
"aggregations": [{
|
||||||
|
"aggregated": true,
|
||||||
|
"attributes": {
|
||||||
|
"resolutionNanos": "10000000000",
|
||||||
|
"downsampleOptions": {"all": true}
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
|
||||||
|
# agg_1m_backfill: 1y retention, 1m resolution, 21d bufferPast
|
||||||
|
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"name": "agg_1m_backfill",
|
||||||
|
"options": {
|
||||||
|
"retentionOptions": {
|
||||||
|
"retentionPeriodDuration": "8760h",
|
||||||
|
"blockSizeDuration": "24h",
|
||||||
|
"bufferFutureDuration": "10m",
|
||||||
|
"bufferPastDuration": "504h"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"aggregationOptions": {
|
||||||
|
"aggregations": [{
|
||||||
|
"aggregated": true,
|
||||||
|
"attributes": {
|
||||||
|
"resolutionNanos": "60000000000",
|
||||||
|
"downsampleOptions": {"all": true}
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 2 — Update Coordinator ConfigMap
|
||||||
|
|
||||||
|
Add new namespaces to `m3coordinator-config`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
clusters:
|
||||||
|
- namespaces:
|
||||||
|
- namespace: default
|
||||||
|
type: unaggregated
|
||||||
|
retention: 168h
|
||||||
|
- namespace: default_backfill
|
||||||
|
type: unaggregated
|
||||||
|
retention: 168h
|
||||||
|
- namespace: agg_10s_30d
|
||||||
|
type: aggregated
|
||||||
|
retention: 2160h
|
||||||
|
resolution: 10s
|
||||||
|
- namespace: agg_10s_backfill
|
||||||
|
type: aggregated
|
||||||
|
retention: 2160h
|
||||||
|
resolution: 10s
|
||||||
|
- namespace: agg_1m_1y
|
||||||
|
type: aggregated
|
||||||
|
retention: 8760h
|
||||||
|
resolution: 1m
|
||||||
|
- namespace: agg_1m_backfill
|
||||||
|
type: aggregated
|
||||||
|
retention: 8760h
|
||||||
|
resolution: 1m
|
||||||
|
```
|
||||||
|
|
||||||
|
Also add downsample rules for backfill namespaces.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 3 — Restart Coordinators
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl rollout restart deployment/m3coordinator -n m3db
|
||||||
|
kubectl rollout status deployment/m3coordinator -n m3db --timeout=120s
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 4 — Run Backfill
|
||||||
|
|
||||||
|
Write directly to `default_backfill` namespace using `__namespace__` label:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# In the protobuf write request, add label:
|
||||||
|
# __namespace__ = "default_backfill"
|
||||||
|
```
|
||||||
|
|
||||||
|
Or use the coordinator endpoint:
|
||||||
|
```
|
||||||
|
POST http://m3coordinator:7201/api/v1/prom/remote/write?namespace=default_backfill
|
||||||
|
```
|
||||||
|
|
||||||
|
Backfill time range: `2026-03-11T00:00:00Z` to `2026-04-01T00:00:00Z`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 5 — Verify
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -sS "http://m3coordinator:7201/api/v1/query" \
|
||||||
|
--data-urlencode 'query=vllm:prompt_tokens_total' \
|
||||||
|
--data-urlencode 'time=2026-03-20T12:00:00Z'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 6 — Revert bufferPast (After Backfill)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# After backfill complete, shrink bufferPast back to 10m
|
||||||
|
# (Only retentionPeriod is mutable, so this requires namespace recreation)
|
||||||
|
# OR: Leave as-is since it's a backfill-only namespace
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Performance Notes
|
||||||
|
|
||||||
|
- M3DB has been fast so far
|
||||||
|
- New namespaces won't impact existing query performance
|
||||||
|
- Queries can fan out to both old and new namespaces in parallel
|
||||||
|
- After backfill, consider consolidating (optional)
|
||||||
87
backfill/README.md
Normal file
87
backfill/README.md
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# M3DB Backfill Tools
|
||||||
|
|
||||||
|
Scripts to backfill historical metrics from Mimir to M3DB.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Copy `.env.example` to `.env` and set your credentials:
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
# Edit .env with your credentials
|
||||||
|
```
|
||||||
|
|
||||||
|
Required environment variables:
|
||||||
|
- `MIMIR_USERNAME` - Mimir API username
|
||||||
|
- `MIMIR_PASSWORD` - Mimir API password
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `backfill.py` | Main backfill script — pulls from Mimir, writes to M3DB |
|
||||||
|
| `backfill-gap.py` | Lightweight script for filling specific time gaps |
|
||||||
|
| `backfill-pod.yaml` | Kubernetes pod manifest for running backfill |
|
||||||
|
| `BACKFILL_RUNBOOK.md` | Detailed runbook with lessons learned |
|
||||||
|
| `test-metrics.py` | Test script for verifying data flow |
|
||||||
|
|
||||||
|
## Quick Usage
|
||||||
|
|
||||||
|
### Full Backfill
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Edit START_TS and END_TS in backfill.py first
|
||||||
|
# Format: Unix timestamps (seconds since epoch)
|
||||||
|
|
||||||
|
# Create configmap and run
|
||||||
|
kubectl create configmap backfill-script --from-file=backfill.py=backfill.py -n m3db
|
||||||
|
kubectl apply -f backfill-pod.yaml
|
||||||
|
kubectl logs -f backfill -n m3db
|
||||||
|
```
|
||||||
|
|
||||||
|
### Fill a Specific Gap
|
||||||
|
|
||||||
|
Edit `backfill-gap.py` to set your time range:
|
||||||
|
|
||||||
|
```python
|
||||||
|
START_TS = 1774175400 # Unix timestamp
|
||||||
|
END_TS = 1774243800 # Unix timestamp
|
||||||
|
```
|
||||||
|
|
||||||
|
Then run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl create configmap backfill-gap-script --from-file=backfill-gap.py=backfill-gap.py -n m3db
|
||||||
|
kubectl apply -f backfill-gap-pod.yaml
|
||||||
|
kubectl logs -f backfill-gap -n m3db
|
||||||
|
```
|
||||||
|
|
||||||
|
## Timestamp Helpers
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Convert date to Unix timestamp
|
||||||
|
date -u -d '2026-03-22 10:30:00' +%s
|
||||||
|
|
||||||
|
# Convert Unix timestamp to date
|
||||||
|
date -u -d @1774175400
|
||||||
|
```
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Mimir credentials (supplied via the `MIMIR_USERNAME` / `MIMIR_PASSWORD` environment variables — see Prerequisites)
|
||||||
|
- M3DB coordinator endpoint: `http://m3coordinator.m3db.svc.cluster.local:7201`
|
||||||
|
- `bufferPast` must be >= the age of data you're backfilling (currently 21 days)
|
||||||
|
|
||||||
|
## Metrics Backfilled
|
||||||
|
|
||||||
|
- `vllm:prompt_tokens_total`
|
||||||
|
- `vllm:generation_tokens_total`
|
||||||
|
- `DCGM_FI_DEV_GPU_UTIL`
|
||||||
|
|
||||||
|
## Cleanup
|
||||||
|
|
||||||
|
After backfill completes:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl delete pod backfill -n m3db
|
||||||
|
kubectl delete configmap backfill-script -n m3db
|
||||||
|
```
|
||||||
18
backfill/backfill-gap-pod.yaml
Normal file
18
backfill/backfill-gap-pod.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: backfill-gap
|
||||||
|
namespace: m3db
|
||||||
|
spec:
|
||||||
|
restartPolicy: Never
|
||||||
|
volumes:
|
||||||
|
- name: script
|
||||||
|
configMap:
|
||||||
|
name: backfill-gap-script
|
||||||
|
containers:
|
||||||
|
- name: backfill
|
||||||
|
image: python:3.11-slim
|
||||||
|
command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-gap.py"]
|
||||||
|
volumeMounts:
|
||||||
|
- name: script
|
||||||
|
mountPath: /scripts
|
||||||
100
backfill/backfill-gap.py
Normal file
100
backfill/backfill-gap.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python3
"""Quick backfill for a specific time gap.

Pulls the configured METRICS from Mimir over [START_TS, END_TS] and
remote-writes them to the M3DB coordinator.

NOTE(review): the original docstring said "April 1 gap (10:44-11:50 UTC)",
which contradicted the configured timestamps below (2026-03-22 -> 2026-03-23).
Edit START_TS / END_TS to target a different gap.
"""
import base64
import json
import os
import ssl
import struct
import urllib.error
import urllib.parse
import urllib.request

import snappy  # third-party: python-snappy (installed by the pod command)

# Read credentials from environment (see .env)
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"

START_TS = 1774175400  # 2026-03-22T10:30:00Z
END_TS = 1774243800  # 2026-03-23T05:30:00Z
STEP = "10s"

METRICS = ["vllm:prompt_tokens_total", "vllm:generation_tokens_total", "DCGM_FI_DEV_GPU_UTIL"]
|
||||||
|
|
||||||
|
def enc(v):
    """Encode a non-negative int as a protobuf base-128 varint (7 bits per byte)."""
    out = bytearray()
    low = v & 0x7F
    v >>= 7
    while v:
        out.append(0x80 | low)  # continuation bit set: more bytes follow
        low = v & 0x7F
        v >>= 7
    out.append(low)  # final byte has the continuation bit clear
    return bytes(out)
|
||||||
|
|
||||||
|
def es(f, d):
    """Protobuf length-delimited field (wire type 2): tag, length, then payload bytes."""
    tag = (f << 3) | 2
    return enc(tag) + enc(len(d)) + d


def ed(f, v):
    """Protobuf 64-bit field (wire type 1): tag followed by the little-endian double."""
    tag = (f << 3) | 1
    return enc(tag) + struct.pack("<d", v)
|
||||||
|
|
||||||
|
def build_ts(labels, samples):
    """Serialize one Prometheus remote-write TimeSeries message.

    labels:  dict of label name -> value (encoded as repeated Label, field 1).
    samples: iterable of (timestamp_ms, value) pairs (repeated Sample, field 2).
    Returns the encoded TimeSeries body as bytes.
    """
    parts = []
    for name, value in labels.items():
        label = es(1, name.encode()) + es(2, value.encode())
        parts.append(enc((1 << 3) | 2) + enc(len(label)) + label)
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        parts.append(enc((2 << 3) | 2) + enc(len(sample)) + sample)
    return b"".join(parts)
|
||||||
|
|
||||||
|
def ssl_ctx():
    """Return an SSL context with certificate verification DISABLED.

    NOTE(security): this skips both hostname and chain verification for the
    Mimir endpoint, leaving the connection open to man-in-the-middle attacks.
    Acceptable only on trusted internal networks — prefer a proper CA bundle.
    """
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx
|
||||||
|
|
||||||
|
def mimir_req(path):
    """GET a Mimir API path with basic auth and return the parsed JSON body.

    Relies on module-level MIMIR_URL / MIMIR_USER / MIMIR_PASS and the
    verification-disabled ssl_ctx(). Raises urllib.error.HTTPError on
    non-2xx responses.
    """
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    # Close the response explicitly (the original leaked the connection).
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
|
||||||
|
|
||||||
|
def write_m3db(data):
    """Snappy-compress an encoded WriteRequest and POST it to the coordinator.

    Returns True on success. On an HTTP error, prints a short diagnostic and
    returns False (best-effort semantics — the caller simply skips the batch).
    """
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    try:
        # Close the response so the HTTP connection is not leaked.
        with urllib.request.urlopen(req, timeout=300):
            pass
        return True
    except urllib.error.HTTPError as e:
        print(f" ERROR {e.code}: {e.read().decode()[:100]}")
        return False
|
||||||
|
|
||||||
|
# Main driver: query each metric from Mimir over the gap and remote-write it.
print(f"Filling gap: {START_TS} to {END_TS}")
total = 0

for metric in METRICS:
    print(f"{metric}...", end=" ", flush=True)
    path = (
        f"/api/v1/query_range?query={urllib.parse.quote(metric)}"
        f"&start={START_TS}&end={END_TS}&step={STEP}"
    )
    data = mimir_req(path)
    # Consistent with backfill-massive.py: .get() avoids a KeyError if the
    # response lacks a "status" field entirely.
    if data.get("status") != "success":
        print("failed")
        continue
    series = data["data"]["result"]
    samples = sum(len(s["values"]) for s in series)
    if samples > 0:
        wr = b""
        for s in series:
            labels = dict(s["metric"])
            labels["cluster"] = "serverless-inference-cluster"
            # Prometheus returns [unix_seconds, "value"]; convert to ms + float.
            pts = [(int(float(v[0]) * 1000), float(v[1])) for v in s["values"]]
            ts = build_ts(labels, pts)
            wr += enc((1 << 3) | 2) + enc(len(ts)) + ts
        if write_m3db(wr):
            print(f"{samples} samples written")
            total += samples
    else:
        print("no data")

print(f"Done! Total: {total} samples")
|
||||||
18
backfill/backfill-massive-pod.yaml
Normal file
18
backfill/backfill-massive-pod.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: backfill-massive
|
||||||
|
namespace: m3db
|
||||||
|
spec:
|
||||||
|
restartPolicy: Never
|
||||||
|
volumes:
|
||||||
|
- name: script
|
||||||
|
configMap:
|
||||||
|
name: backfill-massive-script
|
||||||
|
containers:
|
||||||
|
- name: backfill
|
||||||
|
image: python:3.11-slim
|
||||||
|
command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill-massive.py"]
|
||||||
|
volumeMounts:
|
||||||
|
- name: script
|
||||||
|
mountPath: /scripts
|
||||||
153
backfill/backfill-massive.py
Normal file
153
backfill/backfill-massive.py
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Massive backfill: March 12 - April 1, 2026
|
||||||
|
Writes ONLY to 'default' namespace (raw data)
|
||||||
|
Overlapping chunks - no gaps!
|
||||||
|
"""
|
||||||
|
import struct
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
import urllib.parse
|
||||||
|
import json
|
||||||
|
import ssl
|
||||||
|
import snappy
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Read credentials from environment (see .env)
|
||||||
|
import os
|
||||||
|
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
|
||||||
|
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
|
||||||
|
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
|
||||||
|
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"
|
||||||
|
|
||||||
|
# March 12 to April 1 (full range)
|
||||||
|
START_TS = 1773273600 # March 12 00:00 UTC
|
||||||
|
END_TS = 1775052000 # April 1 14:00 UTC
|
||||||
|
CHUNK_HOURS = 4 # 4-hour chunks
|
||||||
|
OVERLAP_MINUTES = 30 # 30-min overlap between chunks
|
||||||
|
STEP = "10s"
|
||||||
|
|
||||||
|
METRICS = [
|
||||||
|
"vllm:prompt_tokens_total",
|
||||||
|
"vllm:generation_tokens_total",
|
||||||
|
"DCGM_FI_DEV_GPU_UTIL",
|
||||||
|
]
|
||||||
|
|
||||||
|
def enc(v):
    """Protobuf base-128 varint encoding of a non-negative integer."""
    encoded = b""
    while True:
        septet = v & 0x7F
        v >>= 7
        if v:
            encoded += bytes([septet | 0x80])  # more bytes follow
        else:
            return encoded + bytes([septet])  # last byte: continuation bit clear
|
||||||
|
|
||||||
|
def es(f, d):
    """Length-delimited protobuf field (wire type 2) for field number f."""
    return enc(f << 3 | 2) + enc(len(d)) + d


def ed(f, v):
    """64-bit protobuf field (wire type 1) carrying a little-endian double."""
    return enc(f << 3 | 1) + struct.pack("<d", v)
|
||||||
|
|
||||||
|
def build_ts(labels, samples):
    """Encode one remote-write TimeSeries: repeated Label (field 1), then repeated Sample (field 2)."""
    buf = b""
    for label_name, label_value in labels.items():
        pair = es(1, label_name.encode()) + es(2, label_value.encode())
        buf += enc((1 << 3) | 2) + enc(len(pair)) + pair
    for ts_ms, value in samples:
        smp = ed(1, value) + enc((2 << 3) | 0) + enc(ts_ms)
        buf += enc((2 << 3) | 2) + enc(len(smp)) + smp
    return buf
|
||||||
|
|
||||||
|
def ssl_ctx():
    """Build an SSL context that does NOT verify certificates.

    NOTE(security): hostname and chain checks are disabled, so the Mimir
    connection is vulnerable to MITM. Use only on trusted internal networks.
    """
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return context
|
||||||
|
|
||||||
|
def mimir_req(path):
    """GET a Mimir API path with basic auth; return the parsed JSON response.

    Uses module-level MIMIR_URL / MIMIR_USER / MIMIR_PASS and the
    verification-disabled ssl_ctx(). Raises urllib.error.HTTPError on
    non-2xx responses.
    """
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    url = f"{MIMIR_URL}{path}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Basic {auth}")
    # Close the response explicitly (the original leaked the connection).
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
|
||||||
|
|
||||||
|
def write_m3db(data):
    """Snappy-compress an encoded WriteRequest and POST it to the coordinator.

    Targets ONLY the 'default' namespace via the X-M3-Namespace header.
    Returns True on success; on an HTTP error prints a short diagnostic and
    returns False so the caller can skip the chunk.
    """
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    # TARGET ONLY DEFAULT NAMESPACE
    req.add_header("X-M3-Namespace", "default")
    try:
        # Close the response so the HTTP connection is not leaked.
        with urllib.request.urlopen(req, timeout=300):
            pass
        return True
    except urllib.error.HTTPError as e:
        err = e.read().decode()[:200]
        print(f" WRITE ERROR {e.code}: {err}")
        return False
|
||||||
|
|
||||||
|
# Main driver: per metric, walk the range in overlapping chunks and write each.
print(f"MASSIVE BACKFILL - DEFAULT NAMESPACE ONLY")
print(f"Range: March 12 - April 1, 2026")
print(f"Chunk size: {CHUNK_HOURS}h, Overlap: {OVERLAP_MINUTES}m")
print(f"Metrics: {len(METRICS)}")
print("="*60)

total_samples = 0
start_time = time.time()
chunk_seconds = CHUNK_HOURS * 3600
overlap_seconds = OVERLAP_MINUTES * 60

for metric in METRICS:
    print(f"\n{metric}:")
    metric_samples = 0
    chunk_num = 0

    chunk_start = START_TS
    while chunk_start < END_TS:
        chunk_end = min(chunk_start + chunk_seconds, END_TS)
        chunk_num += 1

        path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={chunk_start}&end={chunk_end}&step={STEP}"

        try:
            data = mimir_req(path)
        except Exception as e:
            print(f" Chunk {chunk_num}: QUERY FAILED: {e}")
            data = None

        if data is not None:
            if data.get("status") != "success":
                print(f" Chunk {chunk_num}: status={data.get('status')}")
            else:
                series = data["data"]["result"]
                samples = sum(len(s["values"]) for s in series)
                if samples > 0:
                    wr = b""
                    for s in series:
                        labels = dict(s["metric"])
                        labels["cluster"] = "serverless-inference-cluster"
                        # Prometheus returns [unix_seconds, "value"]; convert to ms + float.
                        pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
                        ts = build_ts(labels, pts)
                        wr += enc((1<<3)|2) + enc(len(ts)) + ts

                    if write_m3db(wr):
                        metric_samples += samples
                        hrs = (chunk_end - chunk_start) / 3600
                        print(f" Chunk {chunk_num}: {samples:,} samples ({hrs:.1f}h) ✓", flush=True)

        # BUG FIX: the original unconditionally set
        #   chunk_start = chunk_end - overlap_seconds
        # even when chunk_end had been clamped to END_TS. That left
        # chunk_start stuck at END_TS - overlap forever, re-querying the
        # final chunk in an infinite loop. Stop once the range end is hit.
        if chunk_end >= END_TS:
            break
        # Next chunk starts with overlap
        chunk_start = chunk_end - overlap_seconds

    total_samples += metric_samples
    print(f" TOTAL {metric}: {metric_samples:,} samples")

elapsed = time.time() - start_time
print("="*60)
print(f"DONE! {total_samples:,} samples in {elapsed:.1f}s")
|
||||||
18
backfill/backfill-pod.yaml
Normal file
18
backfill/backfill-pod.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# One-shot pod that runs the Mimir -> M3DB backfill script (backfill.py).
apiVersion: v1
kind: Pod
metadata:
  name: backfill-v2
  namespace: m3db
spec:
  restartPolicy: Never  # one-shot job: never restart on completion or failure
  volumes:
  - name: script
    configMap:
      name: backfill-script-v2  # ConfigMap carrying backfill.py (mounted below)
  containers:
  - name: backfill
    image: python:3.11-slim
    # python-snappy provides the `snappy` module the script uses to compress
    # Prometheus remote-write payloads before POSTing to the M3 coordinator.
    command: ["sh", "-c", "pip install -q python-snappy requests urllib3 && python3 /scripts/backfill.py"]
    volumeMounts:
    - name: script
      mountPath: /scripts
|
||||||
124
backfill/backfill.py
Normal file
124
backfill/backfill.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
#!/usr/bin/env python3
"""M3DB Backfill - Pull vLLM/DCGM metrics from Mimir and write to M3DB"""
import struct
import urllib.request
import urllib.error
import urllib.parse
import json
import ssl
import snappy
import base64
import sys

print("Starting backfill script...", flush=True)

# Read credentials from environment (see .env)
import os
# Source Mimir endpoint (Prometheus-compatible query API).
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
# Basic-auth credentials; the placeholder defaults will fail auth on purpose
# so a missing env var is obvious in the pod logs.
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
# Destination: in-cluster M3 coordinator (Prometheus remote-write endpoint).
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"

# Backfill window, in unix seconds.
START_TS = 1773187200  # 2026-03-11T00:00:00Z
END_TS = 1775040000  # 2026-04-01T10:40:00Z (just before node restart)
STEP = "10s"  # query_range resolution
CHUNK_HOURS = 6  # query window per chunk; keeps each Mimir response bounded

# Metrics to copy across, one query_range series-set per chunk each.
METRICS = [
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "DCGM_FI_DEV_GPU_UTIL",
]
|
||||||
|
|
||||||
|
def enc(v):
    """Encode a non-negative integer as a protobuf base-128 varint.

    Seven payload bits per byte, least-significant group first; the high
    bit of each byte is set on every byte except the last.
    """
    out = bytearray()
    while True:
        group = v & 0x7F
        v >>= 7
        if v:
            out.append(0x80 | group)  # continuation bit: more groups follow
        else:
            out.append(group)  # final group, high bit clear
            return bytes(out)
|
||||||
|
|
||||||
|
def es(f, d):
    """Protobuf length-delimited field *f* (wire type 2) wrapping bytes *d*."""
    key = enc((f << 3) | 2)
    return key + enc(len(d)) + d
|
||||||
|
def ed(f, v):
    """Protobuf fixed64 field *f* (wire type 1) holding double *v*, little-endian."""
    key = enc((f << 3) | 1)
    return key + struct.pack("<d", v)
|
||||||
|
|
||||||
|
def build_ts(labels, samples):
    """Serialize one Prometheus remote-write TimeSeries message body.

    labels  -- dict of label name -> value (field 1: repeated Label,
               where Label.name = field 1, Label.value = field 2)
    samples -- iterable of (timestamp_ms, value) pairs (field 2: repeated
               Sample, where Sample.value = field 1 double,
               Sample.timestamp = field 2 varint)
    Returns the encoded bytes; the caller wraps them in the outer
    WriteRequest field.
    """
    parts = []
    for name, value in labels.items():
        pair = es(1, name.encode()) + es(2, value.encode())
        parts.append(enc((1 << 3) | 2) + enc(len(pair)) + pair)
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        parts.append(enc((2 << 3) | 2) + enc(len(sample)) + sample)
    return b"".join(parts)
|
||||||
|
|
||||||
|
def ssl_ctx():
    """Return an SSL context with certificate verification disabled.

    NOTE(review): this trusts any certificate the Mimir endpoint presents.
    Acceptable for a one-off in-cluster backfill, but confirm it is
    intentional before reusing elsewhere.
    """
    insecure = ssl.create_default_context()
    insecure.check_hostname = False
    insecure.verify_mode = ssl.CERT_NONE
    return insecure
|
||||||
|
|
||||||
|
def mimir_req(path):
    """GET *path* from the Mimir API with Basic auth; return parsed JSON.

    path -- query string beginning with '/', appended to MIMIR_URL.
    Raises urllib.error.HTTPError/URLError on failure (the driver loop
    catches and logs these per chunk).
    """
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    req = urllib.request.Request(f"{MIMIR_URL}{path}")
    req.add_header("Authorization", f"Basic {auth}")
    # Fix: close the response explicitly instead of leaking the connection
    # (the original bound it to a local and never closed it).
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
|
||||||
|
|
||||||
|
def write_m3db(data):
    """Snappy-compress a WriteRequest payload and POST it to the M3 coordinator.

    data -- raw protobuf WriteRequest bytes (repeated TimeSeries, field 1).
    Returns True on success, False on an HTTP error (logged, not raised);
    non-HTTP failures propagate to the caller's try/except.
    """
    compressed = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", compressed, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    try:
        # Fix: close the response (the original bound it to an unused local
        # and never closed it, leaking the connection per chunk).
        with urllib.request.urlopen(req, timeout=300):
            return True
    except urllib.error.HTTPError as e:
        print(f" ERROR {e.code}: {e.read().decode()[:100]}", flush=True)
        return False
|
||||||
|
|
||||||
|
# ---- Driver: pull each metric from Mimir in CHUNK_HOURS windows and
# ---- remote-write it to M3DB. Best-effort: a failed chunk is logged
# ---- and skipped, never retried.
print(f"Time range: {START_TS} to {END_TS}", flush=True)
total = 0  # samples written across all metrics

for metric in METRICS:
    print(f"\n{metric}...", flush=True)
    metric_total = 0  # samples written for this metric
    chunk_start = START_TS
    chunks_done = 0  # written chunks; drives the periodic progress line

    while chunk_start < END_TS:
        # Clamp the final window to the end of the backfill range.
        chunk_end = min(chunk_start + CHUNK_HOURS * 3600, END_TS)
        try:
            path = f"/api/v1/query_range?query={urllib.parse.quote(metric)}&start={chunk_start}&end={chunk_end}&step={STEP}"
            data = mimir_req(path)
            if data["status"] != "success":
                # Advance past this window before skipping it.
                chunk_start = chunk_end
                continue

            series = data["data"]["result"]
            samples = sum(len(s["values"]) for s in series)
            if samples > 0:
                # Assemble one WriteRequest covering every series in this
                # chunk: field 1 is a repeated length-delimited TimeSeries.
                wr = b""
                for s in series:
                    labels = dict(s["metric"])
                    # Tag backfilled data with the destination cluster label.
                    labels["cluster"] = "serverless-inference-cluster"
                    # Prometheus returns [unix_seconds, "value"] pairs;
                    # remote write wants (milliseconds, float).
                    pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
                    ts = build_ts(labels, pts)
                    wr += enc((1<<3)|2) + enc(len(ts)) + ts
                if write_m3db(wr):
                    metric_total += samples
                    # NOTE(review): only successfully written chunks count
                    # toward the progress counter — confirm this nesting is
                    # intended (empty/failed chunks do not advance it).
                    chunks_done += 1
                    if chunks_done % 10 == 0:
                        print(f" {chunks_done} chunks, {metric_total} samples...", flush=True)

        except Exception as e:
            # Best-effort: log and move on to the next window.
            print(f" Chunk error: {e}", flush=True)

        chunk_start = chunk_end

    print(f" Done: {metric_total} samples", flush=True)
    total += metric_total

print(f"\nBackfill complete! Total: {total} samples", flush=True)
|
||||||
Reference in New Issue
Block a user