154 lines
4.7 KiB
Python
154 lines
4.7 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Massive backfill: March 12 - April 1, 2026
|
||
|
|
Writes ONLY to 'default' namespace (raw data)
|
||
|
|
Overlapping chunks - no gaps!
|
||
|
|
"""
|
||
|
|
import struct
|
||
|
|
import urllib.request
|
||
|
|
import urllib.error
|
||
|
|
import urllib.parse
|
||
|
|
import json
|
||
|
|
import ssl
|
||
|
|
import snappy
|
||
|
|
import base64
|
||
|
|
import time
|
||
|
|
|
||
|
|
# Read credentials from environment (see .env)
|
||
|
|
import os
|
||
|
|
# Mimir (source): queried through its Prometheus-compatible HTTP API.
MIMIR_URL = "https://metrics.vultrlabs.com/prometheus"
MIMIR_USER = os.environ.get("MIMIR_USERNAME", "REPLACE_WITH_MIMIR_USERNAME")
MIMIR_PASS = os.environ.get("MIMIR_PASSWORD", "REPLACE_WITH_MIMIR_PASSWORD")
# M3DB (destination): in-cluster coordinator, Prometheus remote-write endpoint.
M3DB_URL = "http://m3coordinator.m3db.svc.cluster.local:7201"

# March 12 to April 1 (full range) — epoch seconds, UTC.
START_TS = 1773273600  # March 12 00:00 UTC
END_TS = 1775052000  # April 1 14:00 UTC
CHUNK_HOURS = 4  # 4-hour chunks
OVERLAP_MINUTES = 30  # 30-min overlap between chunks
STEP = "10s"  # query_range resolution

# Metrics to backfill; each one is walked over the full range independently.
METRICS = [
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "DCGM_FI_DEV_GPU_UTIL",
]
|
||
|
|
|
||
|
|
def enc(v):
    """Encode a non-negative integer as a protobuf base-128 varint.

    Emits the low 7 bits of *v* per byte, least-significant group first;
    the high bit of each byte is set when more bytes follow.

    Args:
        v: integer >= 0 (used for field tags, lengths, ms timestamps).

    Returns:
        The varint encoding as ``bytes``.

    Raises:
        ValueError: if *v* is negative.  (Fix: the original shift loop
            never terminated on a negative value, since ``v >>= 7`` on a
            negative int never reaches zero — fail fast instead.)
    """
    if v < 0:
        raise ValueError("varint encoding requires a non-negative integer")
    b = v & 0x7f
    v >>= 7
    r = b""
    while v:
        r += bytes([0x80 | b])  # continuation bit set: more bytes follow
        b = v & 0x7f
        v >>= 7
    return r + bytes([b])  # final byte: continuation bit clear
|
||
|
|
|
||
|
|
def es(f, d):
    """Serialize a length-delimited protobuf field (wire type 2): tag, length, payload."""
    tag = enc((f << 3) | 2)
    return tag + enc(len(d)) + d


def ed(f, v):
    """Serialize a 64-bit double protobuf field (wire type 1): tag, little-endian IEEE-754."""
    tag = enc((f << 3) | 1)
    return tag + struct.pack("<d", v)
|
||
|
|
|
||
|
|
def build_ts(labels, samples):
    """Serialize one Prometheus remote-write TimeSeries message.

    Args:
        labels: dict mapping label name -> label value.
        samples: iterable of ``(timestamp_ms, float_value)`` pairs.

    Returns:
        Raw protobuf bytes for the TimeSeries.
    """
    out = b""
    # Field 1 (repeated Label): each pair is name = field 1, value = field 2.
    for name, value in labels.items():
        pair = es(1, name.encode()) + es(2, value.encode())
        out += enc((1 << 3) | 2) + enc(len(pair)) + pair
    # Field 2 (repeated Sample): value = field 1 (double), timestamp = field 2 (varint).
    for t_ms, val in samples:
        sample = ed(1, val) + enc((2 << 3) | 0) + enc(t_ms)
        out += enc((2 << 3) | 2) + enc(len(sample)) + sample
    return out
|
||
|
|
|
||
|
|
def ssl_ctx():
    """Return an SSL context with certificate verification disabled.

    NOTE(review): this skips both hostname and certificate checks, leaving
    the Mimir connection open to MITM — tolerable only for a one-off
    internal backfill; confirm before reusing elsewhere.
    """
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return context
|
||
|
|
|
||
|
|
def mimir_req(path):
    """GET a Mimir API *path* and return the decoded JSON response body.

    Authenticates with HTTP Basic auth built from MIMIR_USER/MIMIR_PASS
    and uses the verification-free SSL context from ssl_ctx().

    Raises:
        urllib.error.HTTPError / urllib.error.URLError on request failure;
        json.JSONDecodeError on a non-JSON body.
    """
    auth = base64.b64encode(f"{MIMIR_USER}:{MIMIR_PASS}".encode()).decode()
    url = f"{MIMIR_URL}{path}"
    req = urllib.request.Request(url)
    req.add_header("Authorization", f"Basic {auth}")
    # Fix: close the response via a context manager — the original never
    # closed it, leaking a connection on every chunk query.
    with urllib.request.urlopen(req, context=ssl_ctx(), timeout=300) as resp:
        return json.loads(resp.read().decode())
|
||
|
|
|
||
|
|
def write_m3db(data):
    """POST a snappy-compressed remote-write payload to M3DB.

    Args:
        data: raw protobuf WriteRequest bytes (uncompressed).

    Returns:
        True on success, False on any HTTP or network error — the caller
        treats a failed chunk as skippable and keeps going.
    """
    c = snappy.compress(data)
    req = urllib.request.Request(f"{M3DB_URL}/api/v1/prom/remote/write", c, method="POST")
    req.add_header("Content-Type", "application/x-protobuf")
    req.add_header("X-Prometheus-Remote-Write-Version", "0.1.0")
    req.add_header("Content-Encoding", "snappy")
    # TARGET ONLY DEFAULT NAMESPACE
    req.add_header("X-M3-Namespace", "default")
    try:
        urllib.request.urlopen(req, timeout=300)
        return True
    except urllib.error.HTTPError as e:
        err = e.read().decode()[:200]
        print(f" WRITE ERROR {e.code}: {err}")
        return False
    except urllib.error.URLError as e:
        # Fix: a plain network failure (connection refused, DNS) previously
        # propagated and aborted the entire backfill; treat it like an HTTP
        # error so only this chunk is lost.
        print(f" WRITE ERROR: {e.reason}")
        return False
|
||
|
|
|
||
|
|
# Banner: summarize the backfill configuration before starting.
print("MASSIVE BACKFILL - DEFAULT NAMESPACE ONLY")
print("Range: March 12 - April 1, 2026")
print(f"Chunk size: {CHUNK_HOURS}h, Overlap: {OVERLAP_MINUTES}m")
print(f"Metrics: {len(METRICS)}")
print("=" * 60)

# Running totals and chunking parameters derived from the config constants.
total_samples = 0
start_time = time.time()
chunk_seconds = 3600 * CHUNK_HOURS
overlap_seconds = 60 * OVERLAP_MINUTES
|
||
|
|
|
||
|
|
# Backfill each metric independently: walk the range in overlapping chunks,
# query Mimir, re-encode as remote-write protobuf, and push to M3DB.
for metric in METRICS:
    print(f"\n{metric}:")
    metric_samples = 0
    chunk_num = 0

    chunk_start = START_TS
    while chunk_start < END_TS:
        chunk_end = min(chunk_start + chunk_seconds, END_TS)
        chunk_num += 1

        path = (f"/api/v1/query_range?query={urllib.parse.quote(metric)}"
                f"&start={chunk_start}&end={chunk_end}&step={STEP}")

        # A failed or empty chunk is skipped (data left as None); the
        # advance logic at the bottom of the loop runs either way.
        try:
            data = mimir_req(path)
        except Exception as e:
            print(f" Chunk {chunk_num}: QUERY FAILED: {e}")
            data = None

        if data is not None and data.get("status") != "success":
            print(f" Chunk {chunk_num}: status={data.get('status')}")
            data = None

        if data is not None:
            series = data["data"]["result"]
            samples = sum(len(s["values"]) for s in series)

            if samples:
                # Build one WriteRequest: repeated TimeSeries in field 1.
                wr = b""
                for s in series:
                    labels = dict(s["metric"])
                    labels["cluster"] = "serverless-inference-cluster"
                    # query_range values are [unix_seconds, "value"] pairs;
                    # remote write wants millisecond timestamps.
                    pts = [(int(float(v[0]) * 1000), float(v[1])) for v in s["values"]]
                    ts = build_ts(labels, pts)
                    wr += enc((1 << 3) | 2) + enc(len(ts)) + ts

                if write_m3db(wr):
                    metric_samples += samples
                    hrs = (chunk_end - chunk_start) / 3600
                    print(f" Chunk {chunk_num}: {samples:,} samples ({hrs:.1f}h) ✓", flush=True)

        # Next chunk starts with overlap.  BUG FIX: the original always set
        # chunk_start = chunk_end - overlap_seconds; once chunk_end reached
        # END_TS that stepped *backwards* below END_TS, so the final
        # 30-minute window was re-queried and re-written forever.  Break
        # explicitly when the end of the range has been processed.
        if chunk_end >= END_TS:
            break
        chunk_start = chunk_end - overlap_seconds

    total_samples += metric_samples
    print(f" TOTAL {metric}: {metric_samples:,} samples")
|
||
|
|
|
||
|
|
# Final summary: total samples written and wall-clock duration.
elapsed = time.time() - start_time
print("=" * 60)
print(f"DONE! {total_samples:,} samples in {elapsed:.1f}s")
|