init commit
This commit is contained in:
13
00-namespace.yaml
Normal file
13
00-namespace.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
##############################################################################
|
||||
# M3DB on Vultr Kubernetes Engine
|
||||
# Replaces Mimir for long-term metrics storage
|
||||
# Uses Vultr Block Storage CSI for persistent volumes
|
||||
##############################################################################
|
||||
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3db
|
||||
app.kubernetes.io/part-of: metrics-platform
|
||||
15
01-storageclass.yaml
Normal file
15
01-storageclass.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
##############################################################################
|
||||
# StorageClass — Vultr Block Storage CSI
|
||||
# Uses Vultr's CSI driver (csi.vultr.com) for dynamic provisioning
|
||||
##############################################################################
|
||||
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: vultr-block-storage-m3db
|
||||
provisioner: block.csi.vultr.com
|
||||
parameters:
|
||||
block_type: "high_perf" # high_perf for SSD-backed NVMe storage
|
||||
reclaimPolicy: Retain # Retain data on PVC deletion (safety)
|
||||
allowVolumeExpansion: true # Allow online volume resizing
|
||||
volumeBindingMode: WaitForFirstConsumer
|
||||
122
02-etcd.yaml
Normal file
122
02-etcd.yaml
Normal file
@@ -0,0 +1,122 @@
|
||||
##############################################################################
|
||||
# etcd cluster for M3DB placement & topology
|
||||
# M3DB requires etcd for cluster coordination
|
||||
##############################################################################
|
||||
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: etcd
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: etcd
|
||||
app.kubernetes.io/part-of: m3db
|
||||
spec:
|
||||
clusterIP: None
|
||||
ports:
|
||||
- name: client
|
||||
port: 2379
|
||||
targetPort: 2379
|
||||
- name: peer
|
||||
port: 2380
|
||||
targetPort: 2380
|
||||
selector:
|
||||
app.kubernetes.io/name: etcd
|
||||
|
||||
---
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: etcd
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: etcd
|
||||
app.kubernetes.io/part-of: m3db
|
||||
spec:
|
||||
serviceName: etcd
|
||||
replicas: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: etcd
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: etcd
|
||||
app.kubernetes.io/part-of: m3db
|
||||
spec:
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
- labelSelector:
|
||||
matchExpressions:
|
||||
- key: app.kubernetes.io/name
|
||||
operator: In
|
||||
values:
|
||||
- etcd
|
||||
topologyKey: kubernetes.io/hostname
|
||||
containers:
|
||||
- name: etcd
|
||||
image: quay.io/coreos/etcd:v3.5.15
|
||||
ports:
|
||||
- containerPort: 2379
|
||||
name: client
|
||||
- containerPort: 2380
|
||||
name: peer
|
||||
env:
|
||||
- name: POD_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: CLUSTER_SIZE
|
||||
value: "3"
|
||||
command:
|
||||
- /bin/sh
|
||||
- -ec
|
||||
- |
|
||||
PEERS=""
|
||||
for i in $(seq 0 $((${CLUSTER_SIZE} - 1))); do
|
||||
PEERS="${PEERS}${PEERS:+,}etcd-${i}=http://etcd-${i}.etcd.m3db.svc.cluster.local:2380"
|
||||
done
|
||||
|
||||
exec etcd \
|
||||
--name=${POD_NAME} \
|
||||
--listen-peer-urls=http://0.0.0.0:2380 \
|
||||
--listen-client-urls=http://0.0.0.0:2379 \
|
||||
--advertise-client-urls=http://${POD_NAME}.etcd.m3db.svc.cluster.local:2379 \
|
||||
--initial-advertise-peer-urls=http://${POD_NAME}.etcd.m3db.svc.cluster.local:2380 \
|
||||
--initial-cluster=${PEERS} \
|
||||
--initial-cluster-state=new \
|
||||
--data-dir=/var/lib/etcd/data \
|
||||
--auto-compaction-retention=1
|
||||
volumeMounts:
|
||||
- name: etcd-data
|
||||
mountPath: /var/lib/etcd
|
||||
resources:
|
||||
requests:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 2379
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 2379
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: etcd-data
|
||||
spec:
|
||||
storageClassName: vultr-block-storage-m3db
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
267
03-configmaps.yaml
Normal file
267
03-configmaps.yaml
Normal file
@@ -0,0 +1,267 @@
|
||||
##############################################################################
|
||||
# M3DB Configuration
|
||||
# Tuned for replacing Mimir — supports Prometheus remote write/read
|
||||
##############################################################################
|
||||
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: m3db-config
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3db
|
||||
data:
|
||||
m3dbnode.yml: |
|
||||
coordinator:
|
||||
listenAddress: 0.0.0.0:7201
|
||||
metrics:
|
||||
scope:
|
||||
prefix: coordinator
|
||||
prometheus:
|
||||
handlerPath: /metrics
|
||||
sanitization: prometheus
|
||||
samplingRate: 1.0
|
||||
extended: none
|
||||
|
||||
# Prometheus remote write/read endpoints (Mimir replacement)
|
||||
tagOptions:
|
||||
idScheme: quoted
|
||||
|
||||
db:
|
||||
logging:
|
||||
level: info
|
||||
|
||||
metrics:
|
||||
prometheus:
|
||||
handlerPath: /metrics
|
||||
sanitization: prometheus
|
||||
samplingRate: 1.0
|
||||
extended: detailed
|
||||
|
||||
listenAddress: 0.0.0.0:9000
|
||||
clusterListenAddress: 0.0.0.0:9001
|
||||
httpNodeListenAddress: 0.0.0.0:9002
|
||||
httpClusterListenAddress: 0.0.0.0:9003
|
||||
debugListenAddress: 0.0.0.0:9004
|
||||
|
||||
hostID:
|
||||
resolver: hostname
|
||||
|
||||
client:
|
||||
writeConsistencyLevel: majority
|
||||
readConsistencyLevel: unstrict_majority
|
||||
writeTimeout: 10s
|
||||
fetchTimeout: 15s
|
||||
connectTimeout: 20s
|
||||
writeRetry:
|
||||
initialBackoff: 500ms
|
||||
backoffFactor: 3
|
||||
maxRetries: 2
|
||||
jitter: true
|
||||
fetchRetry:
|
||||
initialBackoff: 500ms
|
||||
backoffFactor: 2
|
||||
maxRetries: 3
|
||||
jitter: true
|
||||
|
||||
# Cluster discovery via etcd
|
||||
discovery:
|
||||
config:
|
||||
service:
|
||||
env: default_env
|
||||
zone: embedded
|
||||
service: m3db
|
||||
cacheDir: /var/lib/m3kv
|
||||
etcdClusters:
|
||||
- zone: embedded
|
||||
endpoints:
|
||||
- http://etcd-0.etcd.m3db.svc.cluster.local:2379
|
||||
- http://etcd-1.etcd.m3db.svc.cluster.local:2379
|
||||
- http://etcd-2.etcd.m3db.svc.cluster.local:2379
|
||||
|
||||
# Cache configuration
|
||||
cache:
|
||||
series:
|
||||
policy: lru
|
||||
postingsList:
|
||||
size: 262144
|
||||
|
||||
# Commit log
|
||||
commitlog:
|
||||
flushMaxBytes: 524288
|
||||
flushEvery: 1s
|
||||
queue:
|
||||
calculationType: fixed
|
||||
size: 2097152
|
||||
|
||||
# Filesystem (data persistence)
|
||||
filesystem:
|
||||
filePathPrefix: /var/lib/m3db
|
||||
writeBufferSize: 65536
|
||||
dataReadBufferSize: 65536
|
||||
infoReadBufferSize: 128
|
||||
seekReadBufferSize: 4096
|
||||
throughputLimitMbps: 1000.0
|
||||
throughputCheckEvery: 128
|
||||
|
||||
# Repair disabled by default — enable once cluster is stable
|
||||
repair:
|
||||
enabled: false
|
||||
|
||||
# Pooling for performance
|
||||
pooling:
|
||||
blockAllocSize: 16
|
||||
type: simple
|
||||
seriesPool:
|
||||
size: 262144
|
||||
lowWatermark: 0.7
|
||||
highWatermark: 1.0
|
||||
blockPool:
|
||||
size: 262144
|
||||
lowWatermark: 0.7
|
||||
highWatermark: 1.0
|
||||
encoderPool:
|
||||
size: 262144
|
||||
lowWatermark: 0.7
|
||||
highWatermark: 1.0
|
||||
segmentReaderPool:
|
||||
size: 16384
|
||||
lowWatermark: 0.2
|
||||
highWatermark: 1.0
|
||||
iteratorPool:
|
||||
size: 2048
|
||||
lowWatermark: 0.2
|
||||
highWatermark: 1.0
|
||||
fetchBlockMetadataResultsPool:
|
||||
size: 65536
|
||||
capacity: 32
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
fetchBlocksMetadataResultsPool:
|
||||
size: 32
|
||||
capacity: 4096
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
bytesPool:
|
||||
buckets:
|
||||
- capacity: 16
|
||||
size: 524288
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
- capacity: 32
|
||||
size: 262144
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
- capacity: 64
|
||||
size: 131072
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
- capacity: 128
|
||||
size: 65536
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
- capacity: 256
|
||||
size: 65536
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
- capacity: 1440
|
||||
size: 16384
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
- capacity: 4096
|
||||
size: 8192
|
||||
lowWatermark: 0.01
|
||||
highWatermark: 1.0
|
||||
|
||||
---
|
||||
|
||||
##############################################################################
|
||||
# M3 Coordinator standalone config
|
||||
# Handles Prometheus remote read/write + Grafana queries
|
||||
##############################################################################
|
||||
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: m3coordinator-config
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
data:
|
||||
m3coordinator.yml: |
|
||||
listenAddress: 0.0.0.0:7201
|
||||
|
||||
logging:
|
||||
level: info
|
||||
|
||||
metrics:
|
||||
scope:
|
||||
prefix: coordinator
|
||||
prometheus:
|
||||
handlerPath: /metrics
|
||||
sanitization: prometheus
|
||||
samplingRate: 1.0
|
||||
|
||||
tagOptions:
|
||||
idScheme: quoted
|
||||
|
||||
clusters:
|
||||
- namespaces:
|
||||
- namespace: default
|
||||
type: unaggregated
|
||||
retention: 48h
|
||||
- namespace: agg_10s_30d
|
||||
type: aggregated
|
||||
retention: 720h
|
||||
resolution: 10s
|
||||
- namespace: agg_1m_1y
|
||||
type: aggregated
|
||||
retention: 8760h
|
||||
resolution: 1m
|
||||
client:
|
||||
config:
|
||||
service:
|
||||
env: default_env
|
||||
zone: embedded
|
||||
service: m3db
|
||||
cacheDir: /var/lib/m3kv
|
||||
etcdClusters:
|
||||
- zone: embedded
|
||||
endpoints:
|
||||
- http://etcd-0.etcd.m3db.svc.cluster.local:2379
|
||||
- http://etcd-1.etcd.m3db.svc.cluster.local:2379
|
||||
- http://etcd-2.etcd.m3db.svc.cluster.local:2379
|
||||
writeConsistencyLevel: majority
|
||||
readConsistencyLevel: unstrict_majority
|
||||
|
||||
# Downsample configuration
|
||||
downsample:
|
||||
rules:
|
||||
mappingRules:
|
||||
- name: "10s for 30 days"
|
||||
filter: "__name__:*"
|
||||
aggregations: ["Last"]
|
||||
storagePolicies:
|
||||
- resolution: 10s
|
||||
retention: 720h
|
||||
- name: "1m for 1 year"
|
||||
filter: "__name__:*"
|
||||
aggregations: ["Last"]
|
||||
storagePolicies:
|
||||
- resolution: 1m
|
||||
retention: 8760h
|
||||
|
||||
# Ingest — Prometheus remote write
|
||||
ingest:
|
||||
ingester:
|
||||
workerPoolSize: 10000
|
||||
opPool:
|
||||
size: 10000
|
||||
m3msg:
|
||||
server:
|
||||
listenAddress: 0.0.0.0:7507
|
||||
|
||||
# Carbon ingestion disabled (uncomment if needed)
|
||||
# carbon:
|
||||
# ingester:
|
||||
# listenAddress: "0.0.0.0:7204"
|
||||
156
04-m3dbnode.yaml
Normal file
156
04-m3dbnode.yaml
Normal file
@@ -0,0 +1,156 @@
|
||||
##############################################################################
|
||||
# M3DB Node — Headless Service (for StatefulSet DNS)
|
||||
##############################################################################
|
||||
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: m3dbnode
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3dbnode
|
||||
app.kubernetes.io/part-of: m3db
|
||||
spec:
|
||||
clusterIP: None
|
||||
ports:
|
||||
- name: client
|
||||
port: 9000
|
||||
targetPort: 9000
|
||||
- name: cluster
|
||||
port: 9001
|
||||
targetPort: 9001
|
||||
- name: http-node
|
||||
port: 9002
|
||||
targetPort: 9002
|
||||
- name: http-cluster
|
||||
port: 9003
|
||||
targetPort: 9003
|
||||
- name: debug
|
||||
port: 9004
|
||||
targetPort: 9004
|
||||
- name: coordinator
|
||||
port: 7201
|
||||
targetPort: 7201
|
||||
selector:
|
||||
app.kubernetes.io/name: m3dbnode
|
||||
|
||||
---
|
||||
|
||||
##############################################################################
|
||||
# M3DB Node StatefulSet
|
||||
# 3 replicas — one per availability zone / node for HA
|
||||
##############################################################################
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: m3dbnode
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3dbnode
|
||||
app.kubernetes.io/part-of: m3db
|
||||
spec:
|
||||
serviceName: m3dbnode
|
||||
replicas: 3
|
||||
podManagementPolicy: Parallel
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: m3dbnode
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: m3dbnode
|
||||
app.kubernetes.io/part-of: m3db
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "7203"
|
||||
spec:
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchExpressions:
|
||||
- key: app.kubernetes.io/name
|
||||
operator: In
|
||||
values:
|
||||
- m3dbnode
|
||||
topologyKey: kubernetes.io/hostname
|
||||
securityContext:
|
||||
fsGroup: 65534
|
||||
terminationGracePeriodSeconds: 120
|
||||
containers:
|
||||
- name: m3dbnode
|
||||
image: quay.io/m3db/m3dbnode:v1.5.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- "-f"
|
||||
- "/etc/m3db/m3dbnode.yml"
|
||||
ports:
|
||||
- containerPort: 9000
|
||||
name: client
|
||||
- containerPort: 9001
|
||||
name: cluster
|
||||
- containerPort: 9002
|
||||
name: http-node
|
||||
- containerPort: 9003
|
||||
name: http-cluster
|
||||
- containerPort: 9004
|
||||
name: debug
|
||||
- containerPort: 7201
|
||||
name: coordinator
|
||||
- containerPort: 7203
|
||||
name: metrics
|
||||
volumeMounts:
|
||||
- name: m3db-data
|
||||
mountPath: /var/lib/m3db
|
||||
- name: m3db-config
|
||||
mountPath: /etc/m3db
|
||||
- name: cache-dir
|
||||
mountPath: /var/lib/m3kv
|
||||
resources:
|
||||
requests:
|
||||
cpu: "1"
|
||||
memory: 4Gi
|
||||
limits:
|
||||
cpu: "2"
|
||||
memory: 8Gi
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 9002
|
||||
initialDelaySeconds: 60
|
||||
periodSeconds: 15
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 5
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 9002
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
lifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- "sleep 30" # allow in-flight writes to drain
|
||||
volumes:
|
||||
- name: m3db-config
|
||||
configMap:
|
||||
name: m3db-config
|
||||
- name: cache-dir
|
||||
emptyDir: {}
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: m3db-data
|
||||
spec:
|
||||
storageClassName: vultr-block-storage-m3db
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 100Gi # Adjust based on retention & cardinality
|
||||
117
05-m3coordinator.yaml
Normal file
117
05-m3coordinator.yaml
Normal file
@@ -0,0 +1,117 @@
|
||||
##############################################################################
|
||||
# M3 Coordinator — Deployment
|
||||
# Stateless query/write layer — Prometheus remote_write & remote_read target
|
||||
# This is what Grafana and Prometheus talk to (replaces Mimir endpoints)
|
||||
##############################################################################
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: m3coordinator
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
app.kubernetes.io/part-of: m3db
|
||||
spec:
|
||||
replicas: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
app.kubernetes.io/part-of: m3db
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "7203"
|
||||
spec:
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchExpressions:
|
||||
- key: app.kubernetes.io/name
|
||||
operator: In
|
||||
values:
|
||||
- m3coordinator
|
||||
topologyKey: kubernetes.io/hostname
|
||||
containers:
|
||||
- name: m3coordinator
|
||||
image: quay.io/m3db/m3coordinator:v1.5.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- "-f"
|
||||
- "/etc/m3coordinator/m3coordinator.yml"
|
||||
ports:
|
||||
- containerPort: 7201
|
||||
name: api
|
||||
protocol: TCP
|
||||
- containerPort: 7203
|
||||
name: metrics
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/m3coordinator
|
||||
- name: cache-dir
|
||||
mountPath: /var/lib/m3kv
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: "1"
|
||||
memory: 2Gi
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /api/v1/services/m3db/health
|
||||
port: 7201
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /api/v1/services/m3db/health
|
||||
port: 7201
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 5
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: m3coordinator-config
|
||||
- name: cache-dir
|
||||
emptyDir: {}
|
||||
|
||||
---
|
||||
|
||||
##############################################################################
|
||||
# M3 Coordinator Service
|
||||
# Endpoints for Prometheus remote_write / remote_read / Grafana
|
||||
#
|
||||
# remote_write → http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/write
|
||||
# remote_read → http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/read
|
||||
# query (Grafana Prometheus datasource) → http://m3coordinator.m3db.svc.cluster.local:7201
|
||||
##############################################################################
|
||||
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: m3coordinator
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
app.kubernetes.io/part-of: m3db
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: api
|
||||
port: 7201
|
||||
targetPort: 7201
|
||||
protocol: TCP
|
||||
- name: metrics
|
||||
port: 7203
|
||||
targetPort: 7203
|
||||
protocol: TCP
|
||||
selector:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
216
06-init-and-pdb.yaml
Normal file
216
06-init-and-pdb.yaml
Normal file
@@ -0,0 +1,216 @@
|
||||
##############################################################################
|
||||
# PodDisruptionBudgets — keep quorum during rolling updates
|
||||
##############################################################################
|
||||
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: m3dbnode-pdb
|
||||
namespace: m3db
|
||||
spec:
|
||||
minAvailable: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: m3dbnode
|
||||
|
||||
---
|
||||
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
name: etcd-pdb
|
||||
namespace: m3db
|
||||
spec:
|
||||
minAvailable: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: etcd
|
||||
|
||||
---
|
||||
|
||||
##############################################################################
|
||||
# Cluster Init Job
|
||||
# Run ONCE after all m3dbnode pods are Ready to:
|
||||
# 1. Create the placement (topology)
|
||||
# 2. Create the namespaces (retention policies)
|
||||
# 3. Wait for the cluster to initialize
|
||||
#
|
||||
# kubectl apply -f 06-init-and-pdb.yaml
|
||||
# (then monitor with: kubectl logs -n m3db job/m3db-cluster-init)
|
||||
##############################################################################
|
||||
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: m3db-cluster-init
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3db-init
|
||||
app.kubernetes.io/part-of: m3db
|
||||
spec:
|
||||
backoffLimit: 5
|
||||
ttlSecondsAfterFinished: 3600
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: init
|
||||
image: curlimages/curl:8.7.1
|
||||
command:
|
||||
- /bin/sh
|
||||
- -exc
|
||||
- |
|
||||
COORD="http://m3coordinator.m3db.svc.cluster.local:7201"
|
||||
|
||||
echo "=== Waiting for coordinator to be healthy ==="
|
||||
until curl -sf "${COORD}/api/v1/services/m3db/health"; do
|
||||
echo "Coordinator not ready yet, retrying in 5s..."
|
||||
sleep 5
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=== Creating M3DB placement ==="
|
||||
curl -sSf -X POST "${COORD}/api/v1/services/m3db/placement/init" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"num_shards": 64,
|
||||
"replication_factor": 3,
|
||||
"instances": [
|
||||
{
|
||||
"id": "m3dbnode-0",
|
||||
"isolation_group": "zone-a",
|
||||
"zone": "embedded",
|
||||
"weight": 100,
|
||||
"endpoint": "m3dbnode-0.m3dbnode.m3db.svc.cluster.local:9000",
|
||||
"hostname": "m3dbnode-0",
|
||||
"port": 9000
|
||||
},
|
||||
{
|
||||
"id": "m3dbnode-1",
|
||||
"isolation_group": "zone-b",
|
||||
"zone": "embedded",
|
||||
"weight": 100,
|
||||
"endpoint": "m3dbnode-1.m3dbnode.m3db.svc.cluster.local:9000",
|
||||
"hostname": "m3dbnode-1",
|
||||
"port": 9000
|
||||
},
|
||||
{
|
||||
"id": "m3dbnode-2",
|
||||
"isolation_group": "zone-c",
|
||||
"zone": "embedded",
|
||||
"weight": 100,
|
||||
"endpoint": "m3dbnode-2.m3dbnode.m3db.svc.cluster.local:9000",
|
||||
"hostname": "m3dbnode-2",
|
||||
"port": 9000
|
||||
}
|
||||
]
|
||||
}'
|
||||
|
||||
echo ""
|
||||
echo "=== Creating unaggregated namespace (48h retention) ==="
|
||||
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "default",
|
||||
"options": {
|
||||
"bootstrapEnabled": true,
|
||||
"flushEnabled": true,
|
||||
"writesToCommitLog": true,
|
||||
"cleanupEnabled": true,
|
||||
"snapshotEnabled": true,
|
||||
"repairEnabled": false,
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "48h",
|
||||
"blockSizeDuration": "2h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "10m"
|
||||
},
|
||||
"indexOptions": {
|
||||
"enabled": true,
|
||||
"blockSizeDuration": "2h"
|
||||
}
|
||||
}
|
||||
}'
|
||||
|
||||
echo ""
|
||||
echo "=== Creating aggregated namespace: 10s resolution, 30d retention ==="
|
||||
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "agg_10s_30d",
|
||||
"options": {
|
||||
"bootstrapEnabled": true,
|
||||
"flushEnabled": true,
|
||||
"writesToCommitLog": true,
|
||||
"cleanupEnabled": true,
|
||||
"snapshotEnabled": true,
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "720h",
|
||||
"blockSizeDuration": "12h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "10m"
|
||||
},
|
||||
"indexOptions": {
|
||||
"enabled": true,
|
||||
"blockSizeDuration": "12h"
|
||||
},
|
||||
"aggregationOptions": {
|
||||
"aggregations": [
|
||||
{
|
||||
"aggregated": true,
|
||||
"attributes": {
|
||||
"resolutionDuration": "10s"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}'
|
||||
|
||||
echo ""
|
||||
echo "=== Creating aggregated namespace: 1m resolution, 1y retention ==="
|
||||
curl -sSf -X POST "${COORD}/api/v1/services/m3db/namespace" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"name": "agg_1m_1y",
|
||||
"options": {
|
||||
"bootstrapEnabled": true,
|
||||
"flushEnabled": true,
|
||||
"writesToCommitLog": true,
|
||||
"cleanupEnabled": true,
|
||||
"snapshotEnabled": true,
|
||||
"retentionOptions": {
|
||||
"retentionPeriodDuration": "8760h",
|
||||
"blockSizeDuration": "24h",
|
||||
"bufferFutureDuration": "10m",
|
||||
"bufferPastDuration": "10m"
|
||||
},
|
||||
"indexOptions": {
|
||||
"enabled": true,
|
||||
"blockSizeDuration": "24h"
|
||||
},
|
||||
"aggregationOptions": {
|
||||
"aggregations": [
|
||||
{
|
||||
"aggregated": true,
|
||||
"attributes": {
|
||||
"resolutionDuration": "1m"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}'
|
||||
|
||||
echo ""
|
||||
echo "=== Waiting for namespace initialization ==="
|
||||
sleep 10
|
||||
curl -sSf "${COORD}/api/v1/services/m3db/namespace/ready" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{ "name": "default" }' || echo "Namespace not ready yet — this is normal, bootstrapping takes a few minutes."
|
||||
|
||||
echo ""
|
||||
echo "=== M3DB cluster initialization complete ==="
|
||||
echo "Prometheus remote_write → ${COORD}/api/v1/prom/remote/write"
|
||||
echo "Prometheus remote_read → ${COORD}/api/v1/prom/remote/read"
|
||||
echo "PromQL queries → ${COORD}/api/v1/query"
|
||||
111
README.md
Normal file
111
README.md
Normal file
@@ -0,0 +1,111 @@
|
||||
# M3DB on Vultr Kubernetes Engine
|
||||
|
||||
Drop-in Mimir replacement using M3DB for long-term Prometheus metrics storage, deployed on Vultr VKE with Vultr Block Storage CSI.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Prometheus ──remote_write──▶ M3 Coordinator (Deployment, 2 replicas)
|
||||
Grafana ──PromQL query──▶ │
|
||||
│
|
||||
┌───────┴───────┐
|
||||
│ M3DB Nodes │ (StatefulSet, 3 replicas)
|
||||
│ Vultr Block │ (100Gi SSD per node)
|
||||
│ Storage │
|
||||
└───────┬───────┘
|
||||
│
|
||||
etcd cluster (StatefulSet, 3 replicas)
|
||||
```
|
||||
|
||||
## Retention Tiers
|
||||
|
||||
| Namespace | Resolution | Retention | Use Case |
|
||||
|----------------|-----------|-----------|---------------------------|
|
||||
| `default` | raw | 48h | Real-time queries |
|
||||
| `agg_10s_30d` | 10s | 30 days | Recent dashboards |
|
||||
| `agg_1m_1y` | 1m | 1 year | Long-term trends/capacity |
|
||||
|
||||
## Deployment
|
||||
|
||||
```bash
|
||||
# 1. Apply everything (except the init job won't succeed until pods are up)
|
||||
kubectl apply -k .
|
||||
|
||||
# 2. Wait for all pods to be Ready
|
||||
kubectl -n m3db get pods -w
|
||||
|
||||
# 3. Once all m3dbnode and m3coordinator pods are Running, the init job
|
||||
# will bootstrap the cluster (placement + namespaces).
|
||||
# Monitor it:
|
||||
kubectl -n m3db logs -f job/m3db-cluster-init
|
||||
|
||||
# 4. Verify cluster health
|
||||
kubectl -n m3db port-forward svc/m3coordinator 7201:7201
|
||||
curl http://localhost:7201/api/v1/services/m3db/placement
|
||||
curl http://localhost:7201/api/v1/services/m3db/namespace
|
||||
```
|
||||
|
||||
## Prometheus Configuration (Replacing Mimir)
|
||||
|
||||
Update your Prometheus config to point at M3 Coordinator instead of Mimir:
|
||||
|
||||
```yaml
|
||||
# prometheus.yml
|
||||
remote_write:
|
||||
- url: "http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/write"
|
||||
queue_config:
|
||||
capacity: 10000
|
||||
max_shards: 30
|
||||
max_samples_per_send: 5000
|
||||
batch_send_deadline: 5s
|
||||
|
||||
remote_read:
|
||||
- url: "http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/prom/remote/read"
|
||||
read_recent: true
|
||||
```
|
||||
|
||||
## Grafana Datasource
|
||||
|
||||
Add a **Prometheus** datasource in Grafana pointing to:
|
||||
|
||||
```
|
||||
http://m3coordinator.m3db.svc.cluster.local:7201
|
||||
```
|
||||
|
||||
All existing PromQL dashboards will work without modification.
|
||||
|
||||
## Migration from Mimir
|
||||
|
||||
1. **Dual-write phase**: Configure Prometheus to remote_write to both Mimir and M3DB simultaneously.
|
||||
2. **Validation**: Compare query results between Mimir and M3DB for the same time ranges.
|
||||
3. **Cutover**: Once retention in M3DB covers your needs, remove the Mimir remote_write target.
|
||||
4. **Cleanup**: Decommission Mimir components.
|
||||
|
||||
## Tuning for Vultr
|
||||
|
||||
- **Storage**: The `vultr-block-storage-m3db` StorageClass uses `high_perf` (NVMe SSD). Adjust `storage` in the VolumeClaimTemplates based on your cardinality and retention.
|
||||
- **Node sizing**: M3DB is memory-hungry. Recommend at least 8GB RAM nodes on Vultr. The manifest requests 4Gi per m3dbnode pod.
|
||||
- **Shards**: The init job creates 64 shards across 3 nodes. For higher cardinality, increase to 128 or 256.
|
||||
- **Volume expansion**: The StorageClass has `allowVolumeExpansion: true` — you can resize PVCs online via `kubectl edit pvc`.
|
||||
|
||||
## Useful Commands
|
||||
|
||||
```bash
|
||||
# Check placement
|
||||
curl http://localhost:7201/api/v1/services/m3db/placement | jq
|
||||
|
||||
# Check namespace readiness
|
||||
curl http://localhost:7201/api/v1/services/m3db/namespace/ready \
|
||||
-d '{"name":"default"}'
|
||||
|
||||
# Write a test metric
|
||||
curl -X POST http://localhost:7201/api/v1/prom/remote/write \
|
||||
-H "Content-Type: application/x-protobuf"
|
||||
|
||||
# Query via PromQL
|
||||
curl "http://localhost:7201/api/v1/query?query=up"
|
||||
|
||||
# Delete the init job to re-run (if needed)
|
||||
kubectl -n m3db delete job m3db-cluster-init
|
||||
kubectl apply -f 06-init-and-pdb.yaml
|
||||
```
|
||||
11
kustomization.yaml
Normal file
11
kustomization.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: kustomize.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
resources:
|
||||
- 00-namespace.yaml
|
||||
- 01-storageclass.yaml
|
||||
- 02-etcd.yaml
|
||||
- 03-configmaps.yaml
|
||||
- 04-m3dbnode.yaml
|
||||
- 05-m3coordinator.yaml
|
||||
- 06-init-and-pdb.yaml
|
||||
Reference in New Issue
Block a user