Fix m3dbnode port conflict, update README, fix test script
- Remove duplicate db.metrics section (port 7203 conflict)
- Fix coordinator health endpoint (/health, not /api/v1/services/m3db/health)
- Update README: remove NodePort references, always use LoadBalancer
- Add bootstrap instructions (workaround for init job chicken-and-egg)
- Fix test-metrics.sh: correct health endpoint and JSON parsing
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
kubeconfig.yaml
|
||||||
@@ -9,7 +9,8 @@ metadata:
|
|||||||
name: vultr-block-storage-m3db
|
name: vultr-block-storage-m3db
|
||||||
provisioner: block.csi.vultr.com
|
provisioner: block.csi.vultr.com
|
||||||
parameters:
|
parameters:
|
||||||
block_type: "high_perf" # high_perf for SSD-backed NVMe storage
|
disk_type: "nvme" # NVMe SSD
|
||||||
reclaimPolicy: Retain # Retain data on PVC deletion (safety)
|
storage_type: "block" # block storage
|
||||||
|
reclaimPolicy: Delete # Delete PVCs on release (TODO: change to Retain for production)
|
||||||
allowVolumeExpansion: true # Allow online volume resizing
|
allowVolumeExpansion: true # Allow online volume resizing
|
||||||
volumeBindingMode: WaitForFirstConsumer
|
volumeBindingMode: WaitForFirstConsumer
|
||||||
|
|||||||
33
02-etcd.yaml
33
02-etcd.yaml
@@ -13,6 +13,7 @@ metadata:
|
|||||||
app.kubernetes.io/part-of: m3db
|
app.kubernetes.io/part-of: m3db
|
||||||
spec:
|
spec:
|
||||||
clusterIP: None
|
clusterIP: None
|
||||||
|
publishNotReadyAddresses: true
|
||||||
ports:
|
ports:
|
||||||
- name: client
|
- name: client
|
||||||
port: 2379
|
port: 2379
|
||||||
@@ -36,6 +37,7 @@ metadata:
|
|||||||
spec:
|
spec:
|
||||||
serviceName: etcd
|
serviceName: etcd
|
||||||
replicas: 3
|
replicas: 3
|
||||||
|
podManagementPolicy: Parallel
|
||||||
selector:
|
selector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
app.kubernetes.io/name: etcd
|
app.kubernetes.io/name: etcd
|
||||||
@@ -68,27 +70,18 @@ spec:
|
|||||||
valueFrom:
|
valueFrom:
|
||||||
fieldRef:
|
fieldRef:
|
||||||
fieldPath: metadata.name
|
fieldPath: metadata.name
|
||||||
- name: CLUSTER_SIZE
|
|
||||||
value: "3"
|
|
||||||
command:
|
command:
|
||||||
- /bin/sh
|
- etcd
|
||||||
- -ec
|
args:
|
||||||
- |
|
- --name=$(POD_NAME)
|
||||||
PEERS=""
|
- --listen-peer-urls=http://0.0.0.0:2380
|
||||||
for i in $(seq 0 $((${CLUSTER_SIZE} - 1))); do
|
- --listen-client-urls=http://0.0.0.0:2379
|
||||||
PEERS="${PEERS}${PEERS:+,}etcd-${i}=http://etcd-${i}.etcd.m3db.svc.cluster.local:2380"
|
- --advertise-client-urls=http://$(POD_NAME).etcd.m3db.svc.cluster.local:2379
|
||||||
done
|
- --initial-advertise-peer-urls=http://$(POD_NAME).etcd.m3db.svc.cluster.local:2380
|
||||||
|
- --initial-cluster=etcd-0=http://etcd-0.etcd.m3db.svc.cluster.local:2380,etcd-1=http://etcd-1.etcd.m3db.svc.cluster.local:2380,etcd-2=http://etcd-2.etcd.m3db.svc.cluster.local:2380
|
||||||
exec etcd \
|
- --initial-cluster-state=new
|
||||||
--name=${POD_NAME} \
|
- --data-dir=/var/lib/etcd/data
|
||||||
--listen-peer-urls=http://0.0.0.0:2380 \
|
- --auto-compaction-retention=1
|
||||||
--listen-client-urls=http://0.0.0.0:2379 \
|
|
||||||
--advertise-client-urls=http://${POD_NAME}.etcd.m3db.svc.cluster.local:2379 \
|
|
||||||
--initial-advertise-peer-urls=http://${POD_NAME}.etcd.m3db.svc.cluster.local:2380 \
|
|
||||||
--initial-cluster=${PEERS} \
|
|
||||||
--initial-cluster-state=new \
|
|
||||||
--data-dir=/var/lib/etcd/data \
|
|
||||||
--auto-compaction-retention=1
|
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: etcd-data
|
- name: etcd-data
|
||||||
mountPath: /var/lib/etcd
|
mountPath: /var/lib/etcd
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ data:
|
|||||||
prefix: coordinator
|
prefix: coordinator
|
||||||
prometheus:
|
prometheus:
|
||||||
handlerPath: /metrics
|
handlerPath: /metrics
|
||||||
|
listenAddress: 0.0.0.0:7203
|
||||||
sanitization: prometheus
|
sanitization: prometheus
|
||||||
samplingRate: 1.0
|
samplingRate: 1.0
|
||||||
extended: none
|
extended: none
|
||||||
@@ -31,12 +32,8 @@ data:
|
|||||||
logging:
|
logging:
|
||||||
level: info
|
level: info
|
||||||
|
|
||||||
metrics:
|
# Metrics handled by coordinator section above (port 7203)
|
||||||
prometheus:
|
# db-specific metrics disabled to avoid port conflict
|
||||||
handlerPath: /metrics
|
|
||||||
sanitization: prometheus
|
|
||||||
samplingRate: 1.0
|
|
||||||
extended: detailed
|
|
||||||
|
|
||||||
listenAddress: 0.0.0.0:9000
|
listenAddress: 0.0.0.0:9000
|
||||||
clusterListenAddress: 0.0.0.0:9001
|
clusterListenAddress: 0.0.0.0:9001
|
||||||
@@ -199,6 +196,7 @@ data:
|
|||||||
prefix: coordinator
|
prefix: coordinator
|
||||||
prometheus:
|
prometheus:
|
||||||
handlerPath: /metrics
|
handlerPath: /metrics
|
||||||
|
listenAddress: 0.0.0.0:7203
|
||||||
sanitization: prometheus
|
sanitization: prometheus
|
||||||
samplingRate: 1.0
|
samplingRate: 1.0
|
||||||
|
|
||||||
@@ -251,15 +249,10 @@ data:
|
|||||||
- resolution: 1m
|
- resolution: 1m
|
||||||
retention: 8760h
|
retention: 8760h
|
||||||
|
|
||||||
# Ingest — Prometheus remote write
|
# Ingest — Prometheus remote write (uses defaults)
|
||||||
ingest:
|
# ingest:
|
||||||
ingester:
|
# ingester:
|
||||||
workerPoolSize: 10000
|
# workerPoolSize: 10000
|
||||||
opPool:
|
|
||||||
size: 10000
|
|
||||||
m3msg:
|
|
||||||
server:
|
|
||||||
listenAddress: 0.0.0.0:7507
|
|
||||||
|
|
||||||
# Carbon ingestion disabled (uncomment if needed)
|
# Carbon ingestion disabled (uncomment if needed)
|
||||||
# carbon:
|
# carbon:
|
||||||
|
|||||||
@@ -66,13 +66,13 @@ spec:
|
|||||||
memory: 2Gi
|
memory: 2Gi
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /api/v1/services/m3db/health
|
path: /health
|
||||||
port: 7201
|
port: 7201
|
||||||
initialDelaySeconds: 15
|
initialDelaySeconds: 30
|
||||||
periodSeconds: 10
|
periodSeconds: 10
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /api/v1/services/m3db/health
|
path: /health
|
||||||
port: 7201
|
port: 7201
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 5
|
periodSeconds: 5
|
||||||
@@ -115,3 +115,33 @@ spec:
|
|||||||
protocol: TCP
|
protocol: TCP
|
||||||
selector:
|
selector:
|
||||||
app.kubernetes.io/name: m3coordinator
|
app.kubernetes.io/name: m3coordinator
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# M3 Coordinator LoadBalancer Service
|
||||||
|
# External endpoint for cross-region/cross-cluster access
|
||||||
|
# Vultr CCM provisions a managed load balancer automatically
|
||||||
|
#
|
||||||
|
# remote_write → http://<LB-IP>:7201/api/v1/prom/remote/write
|
||||||
|
# remote_read → http://<LB-IP>:7201/api/v1/prom/remote/read
|
||||||
|
# query (Grafana) → http://<LB-IP>:7201
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: m3coordinator-lb
|
||||||
|
namespace: m3db
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: m3coordinator
|
||||||
|
app.kubernetes.io/part-of: m3db
|
||||||
|
spec:
|
||||||
|
type: LoadBalancer
|
||||||
|
ports:
|
||||||
|
- name: api
|
||||||
|
port: 7201
|
||||||
|
targetPort: 7201
|
||||||
|
protocol: TCP
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/name: m3coordinator
|
||||||
|
|||||||
@@ -211,6 +211,12 @@ spec:
|
|||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== M3DB cluster initialization complete ==="
|
echo "=== M3DB cluster initialization complete ==="
|
||||||
|
echo "Internal endpoints (in-cluster):"
|
||||||
echo " Prometheus remote_write → ${COORD}/api/v1/prom/remote/write"
|
echo " Prometheus remote_write → ${COORD}/api/v1/prom/remote/write"
|
||||||
echo " Prometheus remote_read → ${COORD}/api/v1/prom/remote/read"
|
echo " Prometheus remote_read → ${COORD}/api/v1/prom/remote/read"
|
||||||
echo " PromQL queries → ${COORD}/api/v1/query"
|
echo " PromQL queries → ${COORD}/api/v1/query"
|
||||||
|
echo ""
|
||||||
|
echo "External endpoints (cross-cluster):"
|
||||||
|
echo " Get LB IP: kubectl -n m3db get svc m3coordinator-lb"
|
||||||
|
echo " Prometheus remote_write → http://<LB-IP>:7201/api/v1/prom/remote/write"
|
||||||
|
echo " Prometheus remote_read → http://<LB-IP>:7201/api/v1/prom/remote/read"
|
||||||
|
|||||||
139
README.md
139
README.md
@@ -5,16 +5,23 @@ Drop-in Mimir replacement using M3DB for long-term Prometheus metrics storage, d
|
|||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
```
|
```
|
||||||
Prometheus ──remote_write──▶ M3 Coordinator (Deployment, 2 replicas)
|
┌─────────────────────────────────────────────────────┐
|
||||||
Grafana ──PromQL query──▶ │
|
│ Vultr VKE Cluster │
|
||||||
│
|
│ │
|
||||||
┌───────┴───────┐
|
External Prometheus ─┼──remote_write──▶ Vultr LoadBalancer (m3coordinator-lb)
|
||||||
│ M3DB Nodes │ (StatefulSet, 3 replicas)
|
External Grafana ─┼──PromQL query──▶ │ (managed, provisioned by CCM)
|
||||||
│ Vultr Block │ (100Gi SSD per node)
|
│ │
|
||||||
│ Storage │
|
In-cluster Prometheus┼──remote_write──▶ M3 Coordinator (Deployment, 2 replicas)
|
||||||
└───────┬───────┘
|
In-cluster Grafana ┼──PromQL query──▶ │
|
||||||
│
|
│ │
|
||||||
etcd cluster (StatefulSet, 3 replicas)
|
│ ┌───────┴───────┐
|
||||||
|
│ │ M3DB Nodes │ (StatefulSet, 3 replicas)
|
||||||
|
│ │ Vultr Block │ (100Gi NVMe per node)
|
||||||
|
│ │ Storage │
|
||||||
|
│ └───────┬───────┘
|
||||||
|
│ │
|
||||||
|
│ etcd cluster (StatefulSet, 3 replicas)
|
||||||
|
└─────────────────────────────────────────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
## Retention Tiers
|
## Retention Tiers
|
||||||
@@ -28,27 +35,68 @@ Grafana ──PromQL query──▶ │
|
|||||||
## Deployment
|
## Deployment
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. Apply everything (except the init job won't succeed until pods are up)
|
# 1. Apply everything
|
||||||
kubectl apply -k .
|
kubectl apply -k .
|
||||||
|
|
||||||
# 2. Wait for all pods to be Ready
|
# 2. Wait for all pods to be Running
|
||||||
kubectl -n m3db get pods -w
|
kubectl -n m3db get pods -w
|
||||||
|
|
||||||
# 3. Once all m3dbnode and m3coordinator pods are Running, the init job
|
# 3. Bootstrap the cluster (placement + namespaces)
|
||||||
# will bootstrap the cluster (placement + namespaces).
|
# The init job waits for coordinator health, which requires m3db to be bootstrapped.
|
||||||
# Monitor it:
|
# Bootstrap directly via m3dbnode's embedded coordinator:
|
||||||
kubectl -n m3db logs -f job/m3db-cluster-init
|
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/placement/init \
|
||||||
|
-H "Content-Type: application/json" -d '{
|
||||||
|
"num_shards": 64,
|
||||||
|
"replication_factor": 3,
|
||||||
|
"instances": [
|
||||||
|
{"id": "m3dbnode-0", "isolation_group": "zone-a", "zone": "embedded", "weight": 100, "endpoint": "m3dbnode-0.m3dbnode.m3db.svc.cluster.local:9000", "hostname": "m3dbnode-0", "port": 9000},
|
||||||
|
{"id": "m3dbnode-1", "isolation_group": "zone-b", "zone": "embedded", "weight": 100, "endpoint": "m3dbnode-1.m3dbnode.m3db.svc.cluster.local:9000", "hostname": "m3dbnode-1", "port": 9000},
|
||||||
|
{"id": "m3dbnode-2", "isolation_group": "zone-c", "zone": "embedded", "weight": 100, "endpoint": "m3dbnode-2.m3dbnode.m3db.svc.cluster.local:9000", "hostname": "m3dbnode-2", "port": 9000}
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
|
||||||
# 4. Verify cluster health
|
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
|
||||||
kubectl -n m3db port-forward svc/m3coordinator 7201:7201
|
-H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"2h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"2h"}}}'
|
||||||
curl http://localhost:7201/api/v1/services/m3db/placement
|
|
||||||
curl http://localhost:7201/api/v1/services/m3db/namespace
|
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
|
||||||
|
-H "Content-Type: application/json" -d '{"name":"agg_10s_30d","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"720h","blockSizeDuration":"12h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"12h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"10s"}}]}}}'
|
||||||
|
|
||||||
|
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
|
||||||
|
-H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"24h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"24h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}'
|
||||||
|
|
||||||
|
# 4. Wait for bootstrapping to complete (check shard state = AVAILABLE)
|
||||||
|
kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health
|
||||||
|
|
||||||
|
# 5. Get the LoadBalancer IP
|
||||||
|
kubectl -n m3db get svc m3coordinator-lb
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
**Quick connectivity test:**
|
||||||
|
```bash
|
||||||
|
./test-metrics.sh <LB_IP>
|
||||||
|
```
|
||||||
|
|
||||||
|
This script verifies:
|
||||||
|
1. Coordinator health endpoint responds
|
||||||
|
2. Placement is configured with all 3 m3dbnode instances
|
||||||
|
3. All 3 namespaces are created (default, agg_10s_30d, agg_1m_1y)
|
||||||
|
4. PromQL queries work
|
||||||
|
|
||||||
|
**Full read/write test (Python):**
|
||||||
|
```bash
|
||||||
|
pip install requests python-snappy
|
||||||
|
python3 test-metrics.py <LB_IP>
|
||||||
|
```
|
||||||
|
|
||||||
|
Writes a test metric via Prometheus remote_write and reads it back.
|
||||||
|
|
||||||
## Prometheus Configuration (Replacing Mimir)
|
## Prometheus Configuration (Replacing Mimir)
|
||||||
|
|
||||||
Update your Prometheus config to point at M3 Coordinator instead of Mimir:
|
Update your Prometheus config to point at M3 Coordinator.
|
||||||
|
|
||||||
|
**In-cluster (same VKE cluster):**
|
||||||
```yaml
|
```yaml
|
||||||
# prometheus.yml
|
# prometheus.yml
|
||||||
remote_write:
|
remote_write:
|
||||||
@@ -64,13 +112,33 @@ remote_read:
|
|||||||
read_recent: true
|
read_recent: true
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**External (cross-region/cross-cluster):**
|
||||||
|
```yaml
|
||||||
|
# prometheus.yml
|
||||||
|
remote_write:
|
||||||
|
- url: "http://<LB-IP>:7201/api/v1/prom/remote/write"
|
||||||
|
queue_config:
|
||||||
|
capacity: 10000
|
||||||
|
max_shards: 30
|
||||||
|
max_samples_per_send: 5000
|
||||||
|
batch_send_deadline: 5s
|
||||||
|
|
||||||
|
remote_read:
|
||||||
|
- url: "http://<LB-IP>:7201/api/v1/prom/remote/read"
|
||||||
|
read_recent: true
|
||||||
|
```
|
||||||
|
|
||||||
|
Get the LoadBalancer IP:
|
||||||
|
```bash
|
||||||
|
kubectl -n m3db get svc m3coordinator-lb
|
||||||
|
```
|
||||||
|
|
||||||
## Grafana Datasource
|
## Grafana Datasource
|
||||||
|
|
||||||
Add a **Prometheus** datasource in Grafana pointing to:
|
Add a **Prometheus** datasource in Grafana pointing to:
|
||||||
|
|
||||||
```
|
- **In-cluster:** `http://m3coordinator.m3db.svc.cluster.local:7201`
|
||||||
http://m3coordinator.m3db.svc.cluster.local:7201
|
- **External:** `http://<LB-IP>:7201`
|
||||||
```
|
|
||||||
|
|
||||||
All existing PromQL dashboards will work without modification.
|
All existing PromQL dashboards will work without modification.
|
||||||
|
|
||||||
@@ -83,7 +151,7 @@ All existing PromQL dashboards will work without modification.
|
|||||||
|
|
||||||
## Tuning for Vultr
|
## Tuning for Vultr
|
||||||
|
|
||||||
- **Storage**: The `vultr-block-storage-m3db` StorageClass uses `high_perf` (NVMe SSD). Adjust `storage` in the VolumeClaimTemplates based on your cardinality and retention.
|
- **Storage**: The `vultr-block-storage-m3db` StorageClass uses `disk_type: nvme` (NVMe SSD). Adjust `storage` in the VolumeClaimTemplates based on your cardinality and retention.
|
||||||
- **Node sizing**: M3DB is memory-hungry. Recommend at least 8GB RAM nodes on Vultr. The manifest requests 4Gi per m3dbnode pod.
|
- **Node sizing**: M3DB is memory-hungry. Recommend at least 8GB RAM nodes on Vultr. The manifest requests 4Gi per m3dbnode pod.
|
||||||
- **Shards**: The init job creates 64 shards across 3 nodes. For higher cardinality, increase to 128 or 256.
|
- **Shards**: The init job creates 64 shards across 3 nodes. For higher cardinality, increase to 128 or 256.
|
||||||
- **Volume expansion**: The StorageClass has `allowVolumeExpansion: true` — you can resize PVCs online via `kubectl edit pvc`.
|
- **Volume expansion**: The StorageClass has `allowVolumeExpansion: true` — you can resize PVCs online via `kubectl edit pvc`.
|
||||||
@@ -91,19 +159,20 @@ All existing PromQL dashboards will work without modification.
|
|||||||
## Useful Commands
|
## Useful Commands
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Check placement
|
# Get LoadBalancer IP
|
||||||
curl http://localhost:7201/api/v1/services/m3db/placement | jq
|
kubectl -n m3db get svc m3coordinator-lb
|
||||||
|
|
||||||
# Check namespace readiness
|
# Check cluster health (from inside cluster)
|
||||||
curl http://localhost:7201/api/v1/services/m3db/namespace/ready \
|
kubectl -n m3db exec m3dbnode-0 -- curl -s http://m3coordinator.m3db.svc.cluster.local:7201/health
|
||||||
-d '{"name":"default"}'
|
|
||||||
|
|
||||||
# Write a test metric
|
# Check placement (from inside cluster)
|
||||||
curl -X POST http://localhost:7201/api/v1/prom/remote/write \
|
kubectl -n m3db exec m3dbnode-0 -- curl -s http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/services/m3db/placement | jq
|
||||||
-H "Content-Type: application/x-protobuf"
|
|
||||||
|
|
||||||
# Query via PromQL
|
# Check m3dbnode bootstrapped status
|
||||||
curl "http://localhost:7201/api/v1/query?query=up"
|
kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health
|
||||||
|
|
||||||
|
# Query via PromQL (external)
|
||||||
|
curl "http://<LB-IP>:7201/api/v1/query?query=up"
|
||||||
|
|
||||||
# Delete the init job to re-run (if needed)
|
# Delete the init job to re-run (if needed)
|
||||||
kubectl -n m3db delete job m3db-cluster-init
|
kubectl -n m3db delete job m3db-cluster-init
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
apiVersion: kustomize.k8s.io/v1beta1
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
kind: Kustomization
|
kind: Kustomization
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
|
|||||||
241
test-metrics.py
Normal file
241
test-metrics.py
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for M3DB read/write functionality.
|
||||||
|
Usage: python3 test-metrics.py <LB_IP>
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
import requests
|
||||||
|
|
||||||
|
def main():
    """Run an end-to-end smoke test against an M3 Coordinator.

    Takes the coordinator host (typically the LoadBalancer IP) from
    ``sys.argv[1]`` and talks to it on port 7201. The test:

    1. checks the coordinator ``/health`` endpoint (exits 1 on failure),
    2. lists the placement instances,
    3. lists the configured namespaces,
    4. runs a PromQL query for ``up``,
    5. writes one uniquely named sample via Prometheus remote_write
       (snappy-compressed protobuf, hand-encoded so no protobuf library
       is needed),
    6. queries the sample back.

    Only a failed health check or a missing argument terminates the
    process with a non-zero status; later failures are reported on stdout.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 test-metrics.py <LB_IP>")
        print("Example: python3 test-metrics.py 192.168.1.100")
        sys.exit(1)

    host = sys.argv[1]
    base_url = f"http://{host}:7201"

    # Generate unique metric name with timestamp to avoid conflicts
    ts = int(time.time())
    metric_name = f"m3db_test_metric_{ts}"
    metric_value = random.randint(1, 1000)

    print("=== M3DB Metrics Test ===")
    print(f"Host: {host}")
    print(f"Metric: {metric_name}")
    print(f"Value: {metric_value}")
    print()

    print("=== Writing metric ===")
    write_url = f"{base_url}/api/v1/prom/remote/write"

    # Check coordinator health. The coordinator serves its health check at
    # /health (same path the Kubernetes probes and test-metrics.sh use).
    health_url = f"{base_url}/health"
    try:
        resp = requests.get(health_url, timeout=10)
        if resp.status_code == 200:
            print("✓ Coordinator healthy")
        else:
            print(f"✗ Coordinator unhealthy: {resp.status_code}")
            sys.exit(1)
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to connect: {e}")
        sys.exit(1)

    # Check placement
    placement_url = f"{base_url}/api/v1/services/m3db/placement"
    try:
        resp = requests.get(placement_url, timeout=10)
        if resp.status_code == 200:
            placement = resp.json()
            instances = placement.get("placement", {}).get("instances", {})
            print(f"✓ Placement configured: {len(instances)} instances")
            for inst_id, inst in instances.items():
                print(f" - {inst_id}: {inst.get('endpoint', 'unknown')}")
        else:
            print(f"✗ Placement not ready: {resp.status_code}")
            print(f" Response: {resp.text}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to get placement: {e}")

    # Check namespaces
    namespace_url = f"{base_url}/api/v1/services/m3db/namespace"
    try:
        resp = requests.get(namespace_url, timeout=10)
        if resp.status_code == 200:
            ns_data = resp.json()
            # The namespace map is nested under "registry" (this is what
            # test-metrics.sh parses); fall back to a top-level "namespaces"
            # key so a differently shaped response is still reported.
            namespaces = (ns_data.get("registry") or {}).get("namespaces")
            if namespaces is None:
                namespaces = ns_data.get("namespaces", {})
            print(f"✓ Namespaces configured: {len(namespaces)}")
            for ns_name in namespaces:
                print(f" - {ns_name}")
        else:
            print(f"✗ Namespaces not ready: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to get namespaces: {e}")

    # Query test (even if no data, should return empty result)
    print()
    print("=== Query test ===")
    query_url = f"{base_url}/api/v1/query"
    try:
        resp = requests.get(query_url, params={"query": "up"}, timeout=10)
        if resp.status_code == 200:
            result = resp.json()
            status = result.get("status")
            print(f"✓ Query returned: {status}")
            data = result.get("data", {}).get("result", [])
            print(f" Results: {len(data)} series")
        else:
            print(f"✗ Query failed: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Query failed: {e}")

    # Write test metric using remote write protobuf
    print()
    print("=== Write test ===")
    print("Writing via Prometheus remote_write format...")

    import struct

    # python-snappy is an optional dependency; report its absence cleanly
    # instead of crashing with an ImportError traceback.
    try:
        import snappy  # pip install python-snappy
    except ImportError:
        print("✗ Write failed: python-snappy is not installed")
        print(" Install with: pip install python-snappy")
        print()
        print("=== Test complete ===")
        return

    # Prometheus remote_write protobuf (simplified):
    #
    #   message WriteRequest { repeated TimeSeries timeseries = 1; }
    #   message TimeSeries   { repeated Label labels = 1;
    #                          repeated Sample samples = 2; }
    #   message Label        { string name = 1; string value = 2; }
    #   message Sample       { double value = 1; int64 timestamp_ms = 2; }
    #
    # The message is small enough to encode by hand with the helpers below.

    def encode_varint(n):
        """Encode an integer as a protobuf base-128 varint."""
        # Negative int64 values are sent as their 64-bit two's complement.
        n &= 0xFFFFFFFFFFFFFFFF
        out = bytearray()
        while n > 0x7F:
            out.append((n & 0x7F) | 0x80)
            n >>= 7
        out.append(n)
        return bytes(out)

    def encode_bytes(field_num, payload):
        """Encode raw bytes as a length-delimited field (wire type 2)."""
        return encode_varint((field_num << 3) | 2) + encode_varint(len(payload)) + payload

    def encode_string(field_num, s):
        """Encode a str as a UTF-8 length-delimited field."""
        return encode_bytes(field_num, s.encode("utf-8"))

    def encode_double(field_num, value):
        """Encode a double field (wire type 1, little-endian 64-bit)."""
        return encode_varint((field_num << 3) | 1) + struct.pack("<d", value)

    def encode_int64(field_num, value):
        """Encode an int64 field as a varint (wire type 0)."""
        return encode_varint(field_num << 3) + encode_varint(value)

    # Build Sample
    sample = encode_double(1, float(metric_value)) + encode_int64(2, int(time.time() * 1000))

    # Build Labels. Each Label is its own submessage; concatenating the
    # fields of several labels into one message would collapse them into a
    # single label. Pairs are listed in lexicographic order by name, as
    # remote_write receivers expect sorted labels.
    label_pairs = [
        ("__name__", metric_name),
        ("test", "m3db_verification"),
    ]
    encoded_labels = b"".join(
        encode_bytes(1, encode_string(1, name) + encode_string(2, value))
        for name, value in label_pairs
    )

    # Build TimeSeries: repeated Label (field 1) followed by Sample (field 2)
    ts_encoded = encoded_labels + encode_bytes(2, sample)

    # Build WriteRequest (timeseries is field 1)
    write_req = encode_bytes(1, ts_encoded)

    # Compress with snappy
    compressed = snappy.compress(write_req)

    headers = {
        "Content-Encoding": "snappy",
        "Content-Type": "application/x-protobuf",
        "X-Prometheus-Remote-Write-Version": "0.1.0",
    }

    try:
        resp = requests.post(write_url, data=compressed, headers=headers, timeout=10)
        if resp.status_code in (200, 204):
            print(f"✓ Write successful: {metric_name} = {metric_value}")
        else:
            print(f"✗ Write failed: {resp.status_code}")
            print(f" Response: {resp.text}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Write failed: {e}")

    # Wait a moment and query back
    time.sleep(2)

    print()
    print("=== Read back test ===")
    try:
        resp = requests.get(query_url, params={"query": metric_name}, timeout=10)
        if resp.status_code == 200:
            result = resp.json()
            data = result.get("data", {}).get("result", [])
            if data:
                print("✓ Metric found!")
                for series in data:
                    metric = series.get("metric", {})
                    # Instant queries return "value"; range queries return "values".
                    values = series.get("values", series.get("value", []))
                    print(f" Labels: {metric}")
                    print(f" Values: {values}")
            else:
                print("✗ Metric not found (may take a moment to index)")
        else:
            print(f"✗ Query failed: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Query failed: {e}")

    print()
    print("=== Test complete ===")


if __name__ == "__main__":
    main()
|
||||||
75
test-metrics.sh
Executable file
75
test-metrics.sh
Executable file
@@ -0,0 +1,75 @@
|
|||||||
|
#!/bin/bash
#
# Simple M3DB connectivity test
# Usage: ./test-metrics.sh <LB_IP>
#
# Runs four read-only checks against the M3 Coordinator on port 7201:
# health, placement, namespaces and a PromQL query. Only a failed health
# check (or a missing argument / missing python3) exits non-zero; the other
# checks report their result and continue.
#

set -e

LB_IP="${1:-}"
if [ -z "$LB_IP" ]; then
  echo "Usage: $0 <LB_IP>"
  echo "Example: $0 192.168.1.100"
  exit 1
fi

# JSON responses are parsed with python3; without it every count below would
# silently fall back to 0 and be misreported as "not configured".
if ! command -v python3 > /dev/null 2>&1; then
  echo "python3 is required to parse JSON responses" >&2
  exit 1
fi

BASE_URL="http://${LB_IP}:7201"

# Bound every HTTP call so an unreachable load balancer fails fast instead
# of hanging the script.
CURL_OPTS=(-sf --connect-timeout 5 --max-time 15)

echo "=== M3DB Connectivity Test ==="
echo "Target: ${BASE_URL}"
echo ""

# Health check
echo "1. Coordinator Health"
if curl "${CURL_OPTS[@]}" "${BASE_URL}/health" > /dev/null 2>&1; then
  echo " ✓ Healthy"
else
  echo " ✗ Unhealthy or unreachable"
  exit 1
fi

# Placement
echo ""
echo "2. Placement (cluster topology)"
# Fall back to an empty JSON object so the parsers below always get valid input.
PLACEMENT=$(curl "${CURL_OPTS[@]}" "${BASE_URL}/api/v1/services/m3db/placement" 2>/dev/null || echo '{}')
INSTANCE_COUNT=$(echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); print(len(d))" 2>/dev/null || echo "0")
if [ "$INSTANCE_COUNT" -gt 0 ]; then
  echo " ✓ $INSTANCE_COUNT instances in placement"
  echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); [print(f' - {k}') for k in d.keys()]" 2>/dev/null || true
else
  echo " ✗ No placement configured (run init job)"
fi

# Namespaces
echo ""
echo "3. Namespaces (retention policies)"
NAMESPACES=$(curl "${CURL_OPTS[@]}" "${BASE_URL}/api/v1/services/m3db/namespace" 2>/dev/null || echo '{}')
# The namespace map is nested under "registry" in the response.
NS_COUNT=$(echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); print(len(d))" 2>/dev/null || echo "0")
if [ "$NS_COUNT" -gt 0 ]; then
  echo " ✓ $NS_COUNT namespaces configured"
  echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); [print(f' - {k}') for k in d.keys()]" 2>/dev/null || true
else
  echo " ✗ No namespaces configured (run init job)"
fi

# Query test
echo ""
echo "4. Query Test (PromQL)"
QUERY_RESULT=$(curl "${CURL_OPTS[@]}" "${BASE_URL}/api/v1/query?query=up" 2>/dev/null || echo '{"status":"error"}')
STATUS=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")
if [ "$STATUS" = "success" ]; then
  RESULT_COUNT=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('data',{}).get('result',[])))" 2>/dev/null || echo "0")
  echo " ✓ Query returned: $RESULT_COUNT series"
else
  echo " ✗ Query failed"
fi

# Write test (requires protobuf + snappy, so just note it)
echo ""
echo "5. Write Test"
echo " Note: Prometheus remote_write requires protobuf + snappy encoding."
echo " Use test-metrics.py for full write/read verification."
echo " Install: pip install python-snappy requests"

echo ""
echo "=== Test Complete ==="
|
||||||
Reference in New Issue
Block a user