# m3db-vke-setup/backfill/backfill-massive.py
#!/usr/bin/env python3
"""
Massive backfill: March 12 - April 1, 2026
Writes ONLY to 'default' namespace (raw data)
Overlapping chunks - no gaps!
"""
import struct
import urllib.request
import urllib.error
import urllib.parse
import json
import ssl
import snappy
import base64
import time
# Read credentials from environment (see .env)
import os
# Source: Mimir's Prometheus-compatible query API (read side).
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
# Credentials come from the environment; the literals are placeholders only.
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
# Destination: in-cluster m3coordinator remote-write endpoint (write side).
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"
# March 12 to April 1 (full range)
START_TS = 1773273600 # March 12 00:00 UTC
END_TS = 1775052000 # April 1 14:00 UTC
CHUNK_HOURS = 4 # 4-hour chunks
OVERLAP_MINUTES = 30 # 30-min overlap between chunks
# Query resolution passed to query_range; one sample every 10 seconds.
STEP = "10s"
# Metric names to copy verbatim from Mimir into M3DB.
METRICS = [
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"DCGM_FI_DEV_GPU_UTIL",
]
def enc(v):
b = v & 0x7f
v >>= 7
r = b""
while v:
r += bytes([0x80 | b])
b = v & 0x7f
v >>= 7
return r + bytes([b])
def es(f, d):
    """Serialize field number *f* as a length-delimited protobuf field
    (wire type 2): varint tag, varint length, then the raw bytes *d*."""
    return b"".join((enc((f << 3) | 2), enc(len(d)), d))
def ed(f, v):
    """Serialize field number *f* as a 64-bit protobuf field (wire type 1)
    carrying the float *v* as a little-endian IEEE-754 double."""
    return b"".join((enc((f << 3) | 1), struct.pack("<d", v)))
def build_ts(labels, samples):
    """Serialize one Prometheus remote-write TimeSeries message.

    labels:  dict of label name -> value; each becomes a Label
             sub-message (field 1: name, field 2: value) embedded as
             repeated field 1 of the TimeSeries.
    samples: iterable of (timestamp_ms, value) pairs; each becomes a
             Sample sub-message (field 1: double value, field 2: varint
             timestamp) embedded as repeated field 2.
    """
    parts = []
    for name, value in labels.items():
        label = es(1, name.encode()) + es(2, value.encode())
        parts.append(enc((1 << 3) | 2) + enc(len(label)) + label)
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        parts.append(enc((2 << 3) | 2) + enc(len(sample)) + sample)
    return b"".join(parts)
def ssl_ctx():
    """Return an SSL context with certificate and hostname checks disabled.

    NOTE(review): verification is turned off deliberately here (internal /
    self-signed endpoint assumed) — do not reuse this for untrusted hosts.
    """
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return context
def mimir_req(path):
    """GET *path* from the Mimir API with basic auth; return parsed JSON.

    Uses the verification-free SSL context from ssl_ctx() and a 300 s
    timeout. Raises on HTTP/connection errors (handled by the caller).
    """
    credentials = f"{MIMIR_USER}:{MIMIR_PASS}".encode()
    request = urllib.request.Request(MIMIR_URL + path)
    token = base64.b64encode(credentials).decode()
    request.add_header("Authorization", "Basic " + token)
    response = urllib.request.urlopen(request, context=ssl_ctx(), timeout=300)
    return json.loads(response.read().decode())
def write_m3db(data):
    """POST a serialized WriteRequest to M3DB's remote-write endpoint.

    data: raw protobuf WriteRequest bytes; snappy-compressed here as the
    remote-write v1 protocol requires. Returns True on success, False on
    any write error (logged, so the backfill can continue).
    """
    compressed = snappy.compress(data)
    req = urllib.request.Request(
        f"{M3DB_URL}/api/v1/prom/remote/write", compressed, method="POST"
    )
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    # TARGET ONLY DEFAULT NAMESPACE
    req.add_header("X-M3-Namespace", "default")
    try:
        # FIX: use a context manager so the response/connection is closed
        # instead of leaked on every successful write.
        with urllib.request.urlopen(req, timeout=300):
            return True
    except urllib.error.HTTPError as e:
        err = e.read().decode()[:200]
        print(f" WRITE ERROR {e.code}: {err}")
        return False
    except urllib.error.URLError as e:
        # FIX: a connection-level failure (DNS, refused, timeout) previously
        # crashed the whole multi-hour run; skip the chunk instead.
        print(f" WRITE ERROR: {e.reason}")
        return False
# Main driver: pull each metric from Mimir in overlapping 4-hour chunks
# and replay it into M3DB's 'default' namespace via remote write.
print("MASSIVE BACKFILL - DEFAULT NAMESPACE ONLY")
print("Range: March 12 - April 1, 2026")
print(f"Chunk size: {CHUNK_HOURS}h, Overlap: {OVERLAP_MINUTES}m")
print(f"Metrics: {len(METRICS)}")
print("="*60)
total_samples = 0
start_time = time.time()
chunk_seconds = CHUNK_HOURS * 3600
overlap_seconds = OVERLAP_MINUTES * 60
for metric in METRICS:
    print(f"\n{metric}:")
    metric_samples = 0
    chunk_num = 0
    chunk_start = START_TS
    while chunk_start < END_TS:
        chunk_end = min(chunk_start + chunk_seconds, END_TS)
        chunk_num += 1
        # BUG FIX: the old advance `chunk_start = chunk_end - overlap_seconds`
        # looped forever once chunk_end was clamped to END_TS (the next start
        # stayed at END_TS - overlap on every iteration). Advancing from
        # chunk_start is strictly increasing, keeps the intended 30-minute
        # overlap on full chunks, and guarantees termination.
        next_start = chunk_start + chunk_seconds - overlap_seconds
        path = (
            f"/api/v1/query_range?query={urllib.parse.quote(metric)}"
            f"&start={chunk_start}&end={chunk_end}&step={STEP}"
        )
        try:
            data = mimir_req(path)
        except Exception as e:
            # Best-effort: a failed query skips this chunk, run continues.
            print(f" Chunk {chunk_num}: QUERY FAILED: {e}")
            chunk_start = next_start
            continue
        if data.get("status") != "success":
            print(f" Chunk {chunk_num}: status={data.get('status')}")
            chunk_start = next_start
            continue
        series = data["data"]["result"]
        samples = sum(len(s["values"]) for s in series)
        if samples == 0:
            # Empty chunk: nothing to write.
            chunk_start = next_start
            continue
        # Assemble one WriteRequest: repeated TimeSeries as field 1.
        wr = b""
        for s in series:
            labels = dict(s["metric"])
            labels["cluster"] = "serverless-inference-cluster"
            # query_range returns [unix_seconds, "value"]; remote write
            # wants integer milliseconds and a float value.
            pts = [(int(float(v[0])*1000), float(v[1])) for v in s["values"]]
            ts = build_ts(labels, pts)
            wr += enc((1<<3)|2) + enc(len(ts)) + ts
        if write_m3db(wr):
            metric_samples += samples
            hrs = (chunk_end - chunk_start) / 3600
            print(f" Chunk {chunk_num}: {samples:,} samples ({hrs:.1f}h) ✓", flush=True)
        # Next chunk starts with overlap
        chunk_start = next_start
    total_samples += metric_samples
    print(f" TOTAL {metric}: {metric_samples:,} samples")
elapsed = time.time() - start_time
print("="*60)
print(f"DONE! {total_samples:,} samples in {elapsed:.1f}s")