- Changed all namespace block sizes to 1h (was 2h/12h/24h in manifests, 30d+ in the live cluster due to backfill-era bufferPast hacks) - Deleted entire backfill/ directory (scripts, pods, runbooks) - Removed stale 05-m3coordinator.yaml (had backfill namespaces) - Added 05-m3coordinator-deployment.yaml to kustomization - Fixed init job health check (/health instead of /api/v1/services/m3db/health) - Updated .env.example (removed Mimir credentials) - Added 'Why Backfill Doesn't Work' section to README
223 lines
8.0 KiB
YAML
223 lines
8.0 KiB
YAML
##############################################################################
|
|
# PodDisruptionBudgets — keep quorum during rolling updates
|
|
##############################################################################
|
|
|
|
apiVersion: policy/v1
|
|
kind: PodDisruptionBudget
|
|
metadata:
|
|
name: m3dbnode-pdb
|
|
namespace: m3db
|
|
spec:
|
|
minAvailable: 2
|
|
selector:
|
|
matchLabels:
|
|
app.kubernetes.io/name: m3dbnode
|
|
|
|
---
|
|
|
|
apiVersion: policy/v1
|
|
kind: PodDisruptionBudget
|
|
metadata:
|
|
name: etcd-pdb
|
|
namespace: m3db
|
|
spec:
|
|
minAvailable: 2
|
|
selector:
|
|
matchLabels:
|
|
app.kubernetes.io/name: etcd
|
|
|
|
---
|
|
|
|
##############################################################################
|
|
# Cluster Init Job
|
|
# Run ONCE after all m3dbnode pods are Ready to:
|
|
# 1. Create the placement (topology)
|
|
# 2. Create the namespaces (retention policies)
|
|
# 3. Wait for the cluster to initialize
|
|
#
|
|
# kubectl apply -f 06-init-and-pdb.yaml
|
|
# (then monitor with: kubectl logs -n m3db job/m3db-cluster-init)
|
|
##############################################################################
|
|
|
|
apiVersion: batch/v1
|
|
kind: Job
|
|
metadata:
|
|
name: m3db-cluster-init
|
|
namespace: m3db
|
|
labels:
|
|
app.kubernetes.io/name: m3db-init
|
|
app.kubernetes.io/part-of: m3db
|
|
spec:
|
|
backoffLimit: 5
|
|
ttlSecondsAfterFinished: 3600
|
|
template:
|
|
spec:
|
|
restartPolicy: OnFailure
|
|
containers:
|
|
- name: init
|
|
image: curlimages/curl:8.7.1
|
|
command:
|
|
- /bin/sh
|
|
- -exc
|
|
- |
|
|
COORD="http://m3coordinator.m3db.svc.cluster.local:7201"
|
|
|
|
echo "=== Waiting for coordinator to be healthy ==="
|
|
until curl -sf "${COORD}/health"; do
|
|
echo "Coordinator not ready yet, retrying in 5s..."
|
|
sleep 5
|
|
done
|
|
|
|
echo ""
|
|
echo "=== Creating M3DB placement ==="
|
|
curl -sSf -X POST "${COORD}/api/v1/services/m3db/placement/init" \
|
|
-H "Content-Type: application/json" \
|
|
-d '{
|
|
"num_shards": 64,
|
|
"replication_factor": 3,
|
|
"instances": [
|
|
{
|
|
"id": "m3dbnode-0",
|
|
"isolation_group": "zone-a",
|
|
"zone": "embedded",
|
|
"weight": 100,
|
|
"endpoint": "m3dbnode-0.m3dbnode.m3db.svc.cluster.local:9000",
|
|
"hostname": "m3dbnode-0",
|
|
"port": 9000
|
|
},
|
|
{
|
|
"id": "m3dbnode-1",
|
|
"isolation_group": "zone-b",
|
|
"zone": "embedded",
|
|
"weight": 100,
|
|
"endpoint": "m3dbnode-1.m3dbnode.m3db.svc.cluster.local:9000",
|
|
"hostname": "m3dbnode-1",
|
|
"port": 9000
|
|
},
|
|
{
|
|
"id": "m3dbnode-2",
|
|
"isolation_group": "zone-c",
|
|
"zone": "embedded",
|
|
"weight": 100,
|
|
"endpoint": "m3dbnode-2.m3dbnode.m3db.svc.cluster.local:9000",
|
|
"hostname": "m3dbnode-2",
|
|
"port": 9000
|
|
}
|
|
]
|
|
}'
|
|
|
|
echo ""
|
|
echo "=== Creating unaggregated namespace (48h retention) ==="
|
|
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
|
-H "Content-Type: application/json" \
|
|
-d '{
|
|
"name": "default",
|
|
"options": {
|
|
"bootstrapEnabled": true,
|
|
"flushEnabled": true,
|
|
"writesToCommitLog": true,
|
|
"cleanupEnabled": true,
|
|
"snapshotEnabled": true,
|
|
"repairEnabled": false,
|
|
"retentionOptions": {
|
|
"retentionPeriodDuration": "48h",
|
|
"blockSizeDuration": "1h",
|
|
"bufferFutureDuration": "10m",
|
|
"bufferPastDuration": "10m"
|
|
},
|
|
"indexOptions": {
|
|
"enabled": true,
|
|
"blockSizeDuration": "1h"
|
|
}
|
|
}
|
|
}'
|
|
|
|
echo ""
|
|
echo "=== Creating aggregated namespace: 10s resolution, 30d retention ==="
|
|
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
|
-H "Content-Type: application/json" \
|
|
-d '{
|
|
"name": "agg_10s_30d",
|
|
"options": {
|
|
"bootstrapEnabled": true,
|
|
"flushEnabled": true,
|
|
"writesToCommitLog": true,
|
|
"cleanupEnabled": true,
|
|
"snapshotEnabled": true,
|
|
"retentionOptions": {
|
|
"retentionPeriodDuration": "720h",
|
|
"blockSizeDuration": "1h",
|
|
"bufferFutureDuration": "10m",
|
|
"bufferPastDuration": "10m"
|
|
},
|
|
"indexOptions": {
|
|
"enabled": true,
|
|
"blockSizeDuration": "1h"
|
|
},
|
|
"aggregationOptions": {
|
|
"aggregations": [
|
|
{
|
|
"aggregated": true,
|
|
"attributes": {
|
|
"resolutionDuration": "10s"
|
|
}
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}'
|
|
|
|
echo ""
|
|
echo "=== Creating aggregated namespace: 1m resolution, 1y retention ==="
|
|
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
|
-H "Content-Type: application/json" \
|
|
-d '{
|
|
"name": "agg_1m_1y",
|
|
"options": {
|
|
"bootstrapEnabled": true,
|
|
"flushEnabled": true,
|
|
"writesToCommitLog": true,
|
|
"cleanupEnabled": true,
|
|
"snapshotEnabled": true,
|
|
"retentionOptions": {
|
|
"retentionPeriodDuration": "8760h",
|
|
"blockSizeDuration": "1h",
|
|
"bufferFutureDuration": "10m",
|
|
"bufferPastDuration": "10m"
|
|
},
|
|
"indexOptions": {
|
|
"enabled": true,
|
|
"blockSizeDuration": "1h"
|
|
},
|
|
"aggregationOptions": {
|
|
"aggregations": [
|
|
{
|
|
"aggregated": true,
|
|
"attributes": {
|
|
"resolutionDuration": "1m"
|
|
}
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}'
|
|
|
|
echo ""
|
|
echo "=== Waiting for namespace initialization ==="
|
|
sleep 10
|
|
curl -sSf "${COORD}/api/v1/services/m3db/namespace/ready" \
|
|
-H "Content-Type: application/json" \
|
|
-d '{ "name": "default" }' || echo "Namespace not ready yet — this is normal, bootstrapping takes a few minutes."
|
|
|
|
echo ""
|
|
echo "=== M3DB cluster initialization complete ==="
|
|
echo "Internal endpoints (in-cluster):"
|
|
echo " Prometheus remote_write → ${COORD}/api/v1/prom/remote/write"
|
|
echo " Prometheus remote_read → ${COORD}/api/v1/prom/remote/read"
|
|
echo " PromQL queries → ${COORD}/api/v1/query"
|
|
echo ""
|
|
echo "External endpoints (cross-cluster):"
|
|
echo " Get LB IP: kubectl -n m3db get svc m3coordinator-lb"
|
|
echo " Prometheus remote_write → http://<LB-IP>:7201/api/v1/prom/remote/write"
|
|
echo " Prometheus remote_read → http://<LB-IP>:7201/api/v1/prom/remote/read"
|