Files
m3db-vke-setup/06-init-and-pdb.yaml
biondizzle 7ade5ecac8 Clean slate: 1h block sizes, remove backfill artifacts
- Changed all namespace block sizes to 1h (was 2h/12h/24h in manifests,
  30d+ in the live cluster due to backfill-era bufferPast hacks)
- Deleted entire backfill/ directory (scripts, pods, runbooks)
- Removed stale 05-m3coordinator.yaml (had backfill namespaces)
- Added 05-m3coordinator-deployment.yaml to kustomization
- Fixed init job health check (/health instead of /api/v1/services/m3db/health)
- Updated .env.example (removed Mimir credentials)
- Added 'Why Backfill Doesn't Work' section to README
2026-04-09 19:00:08 +00:00

223 lines
8.0 KiB
YAML

##############################################################################
# PodDisruptionBudgets — keep quorum during rolling updates
##############################################################################
# PodDisruptionBudget for the M3DB storage nodes.
# The init Job in this file registers three m3dbnode instances with
# replication_factor 3, so keeping at least 2 pods available means voluntary
# disruptions (node drains, rolling updates) can take down only one replica
# at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: m3dbnode-pdb
  namespace: m3db
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app.kubernetes.io/name: m3dbnode
---
# PodDisruptionBudget for the etcd members in the m3db namespace.
# NOTE(review): minAvailable 2 presumably matches a 3-member etcd cluster
# (2 of 3 is a majority) — confirm against the etcd StatefulSet replica count,
# which is defined outside this file.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: etcd-pdb
  namespace: m3db
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app.kubernetes.io/name: etcd
---
##############################################################################
# Cluster Init Job
# Run after all m3dbnode pods are Ready to:
#   1. Create the placement (topology)
#   2. Create the namespaces (retention policies)
#   3. Mark the namespaces ready once they have bootstrapped (best effort)
#
# The script is written to be re-runnable: a placement or namespace that
# already exists is skipped instead of failing the Job, so the retries allowed
# by backoffLimit can make progress after a partial run.
#
#   kubectl apply -f 06-init-and-pdb.yaml
#   (then monitor with: kubectl logs -n m3db job/m3db-cluster-init)
##############################################################################
apiVersion: batch/v1
kind: Job
metadata:
  name: m3db-cluster-init
  namespace: m3db
  labels:
    app.kubernetes.io/name: m3db-init
    app.kubernetes.io/part-of: m3db
spec:
  backoffLimit: 5
  # Finished Job (and its pod/logs) is garbage-collected after one hour.
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: init
          image: curlimages/curl:8.7.1
          command:
            - /bin/sh
            # -e: abort on the first failing command, -x: trace commands to the log
            - -exc
            - |
              COORD="http://m3coordinator.m3db.svc.cluster.local:7201"

              # post_json PATH PAYLOAD
              # POSTs PAYLOAD as JSON to ${COORD}PATH and prints the response body.
              #   2xx   -> success
              #   409   -> treated as "resource already exists" and skipped, so a
              #            retried Job does not fail on work it already did
              #            (NOTE(review): assumes the coordinator answers 409
              #            Conflict for an existing placement/namespace — confirm)
              #   other -> body and status are printed to stderr, script exits 1
              # A transport-level curl failure aborts the script through `set -e`.
              post_json() {
                path="$1"
                payload="$2"
                # Append the status code on its own last line so body and code
                # can be separated without needing a writable temp file.
                reply=$(curl -sS -w '\n%{http_code}' \
                  -X POST "${COORD}${path}" \
                  -H "Content-Type: application/json" \
                  -d "${payload}")
                code=$(printf '%s\n' "${reply}" | tail -n 1)
                case "${code}" in
                  2??)
                    printf '%s\n' "${reply}" | sed '$d'
                    ;;
                  409)
                    echo "Already exists (HTTP 409), skipping."
                    ;;
                  *)
                    echo "POST ${path} failed with HTTP ${code}:" >&2
                    printf '%s\n' "${reply}" | sed '$d' >&2
                    exit 1
                    ;;
                esac
                echo ""
              }

              echo "=== Waiting for coordinator to be healthy ==="
              until curl -sf "${COORD}/health"; do
                echo "Coordinator not ready yet, retrying in 5s..."
                sleep 5
              done
              echo ""

              echo "=== Creating M3DB placement ==="
              post_json "/api/v1/services/m3db/placement/init" '{
                "num_shards": 64,
                "replication_factor": 3,
                "instances": [
                  {
                    "id": "m3dbnode-0",
                    "isolation_group": "zone-a",
                    "zone": "embedded",
                    "weight": 100,
                    "endpoint": "m3dbnode-0.m3dbnode.m3db.svc.cluster.local:9000",
                    "hostname": "m3dbnode-0",
                    "port": 9000
                  },
                  {
                    "id": "m3dbnode-1",
                    "isolation_group": "zone-b",
                    "zone": "embedded",
                    "weight": 100,
                    "endpoint": "m3dbnode-1.m3dbnode.m3db.svc.cluster.local:9000",
                    "hostname": "m3dbnode-1",
                    "port": 9000
                  },
                  {
                    "id": "m3dbnode-2",
                    "isolation_group": "zone-c",
                    "zone": "embedded",
                    "weight": 100,
                    "endpoint": "m3dbnode-2.m3dbnode.m3db.svc.cluster.local:9000",
                    "hostname": "m3dbnode-2",
                    "port": 9000
                  }
                ]
              }'

              echo "=== Creating unaggregated namespace (48h retention) ==="
              post_json "/api/v1/services/m3db/namespace" '{
                "name": "default",
                "options": {
                  "bootstrapEnabled": true,
                  "flushEnabled": true,
                  "writesToCommitLog": true,
                  "cleanupEnabled": true,
                  "snapshotEnabled": true,
                  "repairEnabled": false,
                  "retentionOptions": {
                    "retentionPeriodDuration": "48h",
                    "blockSizeDuration": "1h",
                    "bufferFutureDuration": "10m",
                    "bufferPastDuration": "10m"
                  },
                  "indexOptions": {
                    "enabled": true,
                    "blockSizeDuration": "1h"
                  }
                }
              }'

              echo "=== Creating aggregated namespace: 10s resolution, 30d retention ==="
              post_json "/api/v1/services/m3db/namespace" '{
                "name": "agg_10s_30d",
                "options": {
                  "bootstrapEnabled": true,
                  "flushEnabled": true,
                  "writesToCommitLog": true,
                  "cleanupEnabled": true,
                  "snapshotEnabled": true,
                  "retentionOptions": {
                    "retentionPeriodDuration": "720h",
                    "blockSizeDuration": "1h",
                    "bufferFutureDuration": "10m",
                    "bufferPastDuration": "10m"
                  },
                  "indexOptions": {
                    "enabled": true,
                    "blockSizeDuration": "1h"
                  },
                  "aggregationOptions": {
                    "aggregations": [
                      {
                        "aggregated": true,
                        "attributes": {
                          "resolutionDuration": "10s"
                        }
                      }
                    ]
                  }
                }
              }'

              echo "=== Creating aggregated namespace: 1m resolution, 1y retention ==="
              post_json "/api/v1/services/m3db/namespace" '{
                "name": "agg_1m_1y",
                "options": {
                  "bootstrapEnabled": true,
                  "flushEnabled": true,
                  "writesToCommitLog": true,
                  "cleanupEnabled": true,
                  "snapshotEnabled": true,
                  "retentionOptions": {
                    "retentionPeriodDuration": "8760h",
                    "blockSizeDuration": "1h",
                    "bufferFutureDuration": "10m",
                    "bufferPastDuration": "10m"
                  },
                  "indexOptions": {
                    "enabled": true,
                    "blockSizeDuration": "1h"
                  },
                  "aggregationOptions": {
                    "aggregations": [
                      {
                        "aggregated": true,
                        "attributes": {
                          "resolutionDuration": "1m"
                        }
                      }
                    ]
                  }
                }
              }'

              echo "=== Waiting for namespace initialization ==="
              # Mark every namespace created above as ready. The call fails while
              # a namespace is still bootstrapping, so retry in rounds (up to
              # 30 rounds, 10s apart) and only re-try the ones still pending.
              # This stays best-effort: running out of rounds logs a warning
              # but does not fail the Job.
              pending="default agg_10s_30d agg_1m_1y"
              round=1
              while [ -n "${pending}" ] && [ "${round}" -le 30 ]; do
                sleep 10
                still_pending=""
                for ns in ${pending}; do
                  if curl -sSf "${COORD}/api/v1/services/m3db/namespace/ready" \
                    -H "Content-Type: application/json" \
                    -d "{ \"name\": \"${ns}\" }"; then
                    echo ""
                    echo "Namespace ${ns} marked ready."
                  else
                    still_pending="${still_pending} ${ns}"
                  fi
                done
                # Drop the leading separator left by the concatenation above.
                pending="${still_pending# }"
                round=$((round + 1))
              done
              if [ -n "${pending}" ]; then
                echo "Namespaces not ready yet: ${pending} — this is normal, bootstrapping takes a few minutes."
              fi
              echo ""

              echo "=== M3DB cluster initialization complete ==="
              echo "Internal endpoints (in-cluster):"
              echo " Prometheus remote_write → ${COORD}/api/v1/prom/remote/write"
              echo " Prometheus remote_read → ${COORD}/api/v1/prom/remote/read"
              echo " PromQL queries → ${COORD}/api/v1/query"
              echo ""
              echo "External endpoints (cross-cluster):"
              echo " Get LB IP: kubectl -n m3db get svc m3coordinator-lb"
              echo " Prometheus remote_write → http://<LB-IP>:7201/api/v1/prom/remote/write"
              echo " Prometheus remote_read → http://<LB-IP>:7201/api/v1/prom/remote/read"