Fix m3dbnode port conflict, update README, fix test script

- Remove duplicate db.metrics section (port 7203 conflict)
- Fix coordinator health endpoint (/health not /api/v1/services/m3db/health)
- Update README: remove NodePort references, always use LoadBalancer
- Add bootstrap instructions (workaround for init job chicken-and-egg)
- Fix test-metrics.sh: correct health endpoint and JSON parsing
This commit is contained in:
2026-03-31 15:49:59 +00:00
parent ac13c30905
commit a8469f79d7
10 changed files with 488 additions and 79 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
kubeconfig.yaml

View File

@@ -9,7 +9,8 @@ metadata:
name: vultr-block-storage-m3db name: vultr-block-storage-m3db
provisioner: block.csi.vultr.com provisioner: block.csi.vultr.com
parameters: parameters:
block_type: "high_perf" # high_perf for SSD-backed NVMe storage disk_type: "nvme" # NVMe SSD
reclaimPolicy: Retain # Retain data on PVC deletion (safety) storage_type: "block" # block storage
reclaimPolicy: Delete # Delete PVCs on release (TODO: change to Retain for production)
allowVolumeExpansion: true # Allow online volume resizing allowVolumeExpansion: true # Allow online volume resizing
volumeBindingMode: WaitForFirstConsumer volumeBindingMode: WaitForFirstConsumer

View File

@@ -13,6 +13,7 @@ metadata:
app.kubernetes.io/part-of: m3db app.kubernetes.io/part-of: m3db
spec: spec:
clusterIP: None clusterIP: None
publishNotReadyAddresses: true
ports: ports:
- name: client - name: client
port: 2379 port: 2379
@@ -36,6 +37,7 @@ metadata:
spec: spec:
serviceName: etcd serviceName: etcd
replicas: 3 replicas: 3
podManagementPolicy: Parallel
selector: selector:
matchLabels: matchLabels:
app.kubernetes.io/name: etcd app.kubernetes.io/name: etcd
@@ -68,27 +70,18 @@ spec:
valueFrom: valueFrom:
fieldRef: fieldRef:
fieldPath: metadata.name fieldPath: metadata.name
- name: CLUSTER_SIZE
value: "3"
command: command:
- /bin/sh - etcd
- -ec args:
- | - --name=$(POD_NAME)
PEERS="" - --listen-peer-urls=http://0.0.0.0:2380
for i in $(seq 0 $((${CLUSTER_SIZE} - 1))); do - --listen-client-urls=http://0.0.0.0:2379
PEERS="${PEERS}${PEERS:+,}etcd-${i}=http://etcd-${i}.etcd.m3db.svc.cluster.local:2380" - --advertise-client-urls=http://$(POD_NAME).etcd.m3db.svc.cluster.local:2379
done - --initial-advertise-peer-urls=http://$(POD_NAME).etcd.m3db.svc.cluster.local:2380
- --initial-cluster=etcd-0=http://etcd-0.etcd.m3db.svc.cluster.local:2380,etcd-1=http://etcd-1.etcd.m3db.svc.cluster.local:2380,etcd-2=http://etcd-2.etcd.m3db.svc.cluster.local:2380
exec etcd \ - --initial-cluster-state=new
--name=${POD_NAME} \ - --data-dir=/var/lib/etcd/data
--listen-peer-urls=http://0.0.0.0:2380 \ - --auto-compaction-retention=1
--listen-client-urls=http://0.0.0.0:2379 \
--advertise-client-urls=http://${POD_NAME}.etcd.m3db.svc.cluster.local:2379 \
--initial-advertise-peer-urls=http://${POD_NAME}.etcd.m3db.svc.cluster.local:2380 \
--initial-cluster=${PEERS} \
--initial-cluster-state=new \
--data-dir=/var/lib/etcd/data \
--auto-compaction-retention=1
volumeMounts: volumeMounts:
- name: etcd-data - name: etcd-data
mountPath: /var/lib/etcd mountPath: /var/lib/etcd

View File

@@ -19,6 +19,7 @@ data:
prefix: coordinator prefix: coordinator
prometheus: prometheus:
handlerPath: /metrics handlerPath: /metrics
listenAddress: 0.0.0.0:7203
sanitization: prometheus sanitization: prometheus
samplingRate: 1.0 samplingRate: 1.0
extended: none extended: none
@@ -31,12 +32,8 @@ data:
logging: logging:
level: info level: info
metrics: # Metrics handled by coordinator section above (port 7203)
prometheus: # db-specific metrics disabled to avoid port conflict
handlerPath: /metrics
sanitization: prometheus
samplingRate: 1.0
extended: detailed
listenAddress: 0.0.0.0:9000 listenAddress: 0.0.0.0:9000
clusterListenAddress: 0.0.0.0:9001 clusterListenAddress: 0.0.0.0:9001
@@ -199,6 +196,7 @@ data:
prefix: coordinator prefix: coordinator
prometheus: prometheus:
handlerPath: /metrics handlerPath: /metrics
listenAddress: 0.0.0.0:7203
sanitization: prometheus sanitization: prometheus
samplingRate: 1.0 samplingRate: 1.0
@@ -251,15 +249,10 @@ data:
- resolution: 1m - resolution: 1m
retention: 8760h retention: 8760h
# Ingest — Prometheus remote write # Ingest — Prometheus remote write (uses defaults)
ingest: # ingest:
ingester: # ingester:
workerPoolSize: 10000 # workerPoolSize: 10000
opPool:
size: 10000
m3msg:
server:
listenAddress: 0.0.0.0:7507
# Carbon ingestion disabled (uncomment if needed) # Carbon ingestion disabled (uncomment if needed)
# carbon: # carbon:

View File

@@ -66,13 +66,13 @@ spec:
memory: 2Gi memory: 2Gi
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /api/v1/services/m3db/health path: /health
port: 7201 port: 7201
initialDelaySeconds: 15 initialDelaySeconds: 30
periodSeconds: 10 periodSeconds: 10
readinessProbe: readinessProbe:
httpGet: httpGet:
path: /api/v1/services/m3db/health path: /health
port: 7201 port: 7201
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 5 periodSeconds: 5
@@ -115,3 +115,33 @@ spec:
protocol: TCP protocol: TCP
selector: selector:
app.kubernetes.io/name: m3coordinator app.kubernetes.io/name: m3coordinator
---
##############################################################################
# M3 Coordinator LoadBalancer Service
# External endpoint for cross-region/cross-cluster access
# Vultr CCM provisions a managed load balancer automatically
#
# remote_write → http://<LB-IP>:7201/api/v1/prom/remote/write
# remote_read → http://<LB-IP>:7201/api/v1/prom/remote/read
# query (Grafana) → http://<LB-IP>:7201
##############################################################################
apiVersion: v1
kind: Service
metadata:
name: m3coordinator-lb
namespace: m3db
labels:
app.kubernetes.io/name: m3coordinator
app.kubernetes.io/part-of: m3db
spec:
type: LoadBalancer
ports:
- name: api
port: 7201
targetPort: 7201
protocol: TCP
selector:
app.kubernetes.io/name: m3coordinator

View File

@@ -211,6 +211,12 @@ spec:
echo "" echo ""
echo "=== M3DB cluster initialization complete ===" echo "=== M3DB cluster initialization complete ==="
echo "Prometheus remote_write → ${COORD}/api/v1/prom/remote/write" echo "Internal endpoints (in-cluster):"
echo "Prometheus remote_read → ${COORD}/api/v1/prom/remote/read" echo " Prometheus remote_write → ${COORD}/api/v1/prom/remote/write"
echo "PromQL queries → ${COORD}/api/v1/query" echo " Prometheus remote_read → ${COORD}/api/v1/prom/remote/read"
echo " PromQL queries → ${COORD}/api/v1/query"
echo ""
echo "External endpoints (cross-cluster):"
echo " Get LB IP: kubectl -n m3db get svc m3coordinator-lb"
echo " Prometheus remote_write → http://<LB-IP>:7201/api/v1/prom/remote/write"
echo " Prometheus remote_read → http://<LB-IP>:7201/api/v1/prom/remote/read"

139
README.md
View File

@@ -5,16 +5,23 @@ Drop-in Mimir replacement using M3DB for long-term Prometheus metrics storage, d
## Architecture ## Architecture
``` ```
Prometheus ──remote_write──▶ M3 Coordinator (Deployment, 2 replicas) ┌─────────────────────────────────────────────────────┐
Grafana ──PromQL query──▶ │ Vultr VKE Cluster
┌───────┴───────┐ External Prometheus ─┼──remote_write──▶ Vultr LoadBalancer (m3coordinator-lb)
│ M3DB Nodes │ (StatefulSet, 3 replicas) External Grafana ─┼──PromQL query──▶ │ (managed, provisioned by CCM)
Vultr Block (100Gi SSD per node)
│ Storage │ In-cluster Prometheus┼──remote_write──▶ M3 Coordinator (Deployment, 2 replicas)
└───────┬───────┘ In-cluster Grafana ┼──PromQL query──▶ │
etcd cluster (StatefulSet, 3 replicas) ┌───────┴───────┐
│ │ M3DB Nodes │ (StatefulSet, 3 replicas)
│ │ Vultr Block │ (100Gi NVMe per node)
│ │ Storage │
│ └───────┬───────┘
│ │
│ etcd cluster (StatefulSet, 3 replicas)
└─────────────────────────────────────────────────────┘
``` ```
## Retention Tiers ## Retention Tiers
@@ -28,27 +35,68 @@ Grafana ──PromQL query──▶ │
## Deployment ## Deployment
```bash ```bash
# 1. Apply everything (except the init job won't succeed until pods are up) # 1. Apply everything
kubectl apply -k . kubectl apply -k .
# 2. Wait for all pods to be Ready # 2. Wait for all pods to be Running
kubectl -n m3db get pods -w kubectl -n m3db get pods -w
# 3. Once all m3dbnode and m3coordinator pods are Running, the init job # 3. Bootstrap the cluster (placement + namespaces)
# will bootstrap the cluster (placement + namespaces). # The init job waits for coordinator health, which requires m3db to be bootstrapped.
# Monitor it: # Bootstrap directly via m3dbnode's embedded coordinator:
kubectl -n m3db logs -f job/m3db-cluster-init kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/placement/init \
-H "Content-Type: application/json" -d '{
"num_shards": 64,
"replication_factor": 3,
"instances": [
{"id": "m3dbnode-0", "isolation_group": "zone-a", "zone": "embedded", "weight": 100, "endpoint": "m3dbnode-0.m3dbnode.m3db.svc.cluster.local:9000", "hostname": "m3dbnode-0", "port": 9000},
{"id": "m3dbnode-1", "isolation_group": "zone-b", "zone": "embedded", "weight": 100, "endpoint": "m3dbnode-1.m3dbnode.m3db.svc.cluster.local:9000", "hostname": "m3dbnode-1", "port": 9000},
{"id": "m3dbnode-2", "isolation_group": "zone-c", "zone": "embedded", "weight": 100, "endpoint": "m3dbnode-2.m3dbnode.m3db.svc.cluster.local:9000", "hostname": "m3dbnode-2", "port": 9000}
]
}'
# 4. Verify cluster health kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
kubectl -n m3db port-forward svc/m3coordinator 7201:7201 -H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"2h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"2h"}}}'
curl http://localhost:7201/api/v1/services/m3db/placement
curl http://localhost:7201/api/v1/services/m3db/namespace kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
-H "Content-Type: application/json" -d '{"name":"agg_10s_30d","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"720h","blockSizeDuration":"12h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"12h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"10s"}}]}}}'
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
-H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"24h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"24h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}'
# 4. Wait for bootstrapping to complete (check shard state = AVAILABLE)
kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health
# 5. Get the LoadBalancer IP
kubectl -n m3db get svc m3coordinator-lb
``` ```
## Testing
**Quick connectivity test:**
```bash
./test-metrics.sh <LB_IP>
```
This script verifies:
1. Coordinator health endpoint responds
2. Placement is configured with all 3 m3dbnode instances
3. All 3 namespaces are created (default, agg_10s_30d, agg_1m_1y)
4. PromQL queries work
**Full read/write test (Python):**
```bash
pip install requests python-snappy
python3 test-metrics.py <LB_IP>
```
Writes a test metric via Prometheus remote_write and reads it back.
## Prometheus Configuration (Replacing Mimir) ## Prometheus Configuration (Replacing Mimir)
Update your Prometheus config to point at M3 Coordinator instead of Mimir: Update your Prometheus config to point at M3 Coordinator.
**In-cluster (same VKE cluster):**
```yaml ```yaml
# prometheus.yml # prometheus.yml
remote_write: remote_write:
@@ -64,13 +112,33 @@ remote_read:
read_recent: true read_recent: true
``` ```
**External (cross-region/cross-cluster):**
```yaml
# prometheus.yml
remote_write:
- url: "http://<LB-IP>:7201/api/v1/prom/remote/write"
queue_config:
capacity: 10000
max_shards: 30
max_samples_per_send: 5000
batch_send_deadline: 5s
remote_read:
- url: "http://<LB-IP>:7201/api/v1/prom/remote/read"
read_recent: true
```
Get the LoadBalancer IP:
```bash
kubectl -n m3db get svc m3coordinator-lb
```
## Grafana Datasource ## Grafana Datasource
Add a **Prometheus** datasource in Grafana pointing to: Add a **Prometheus** datasource in Grafana pointing to:
``` - **In-cluster:** `http://m3coordinator.m3db.svc.cluster.local:7201`
http://m3coordinator.m3db.svc.cluster.local:7201 - **External:** `http://<LB-IP>:7201`
```
All existing PromQL dashboards will work without modification. All existing PromQL dashboards will work without modification.
@@ -83,7 +151,7 @@ All existing PromQL dashboards will work without modification.
## Tuning for Vultr ## Tuning for Vultr
- **Storage**: The `vultr-block-storage-m3db` StorageClass uses `high_perf` (NVMe SSD). Adjust `storage` in the VolumeClaimTemplates based on your cardinality and retention. - **Storage**: The `vultr-block-storage-m3db` StorageClass uses `disk_type: nvme` (NVMe SSD). Adjust `storage` in the VolumeClaimTemplates based on your cardinality and retention.
- **Node sizing**: M3DB is memory-hungry. Recommend at least 8GB RAM nodes on Vultr. The manifest requests 4Gi per m3dbnode pod. - **Node sizing**: M3DB is memory-hungry. Recommend at least 8GB RAM nodes on Vultr. The manifest requests 4Gi per m3dbnode pod.
- **Shards**: The init job creates 64 shards across 3 nodes. For higher cardinality, increase to 128 or 256. - **Shards**: The init job creates 64 shards across 3 nodes. For higher cardinality, increase to 128 or 256.
- **Volume expansion**: The StorageClass has `allowVolumeExpansion: true` — you can resize PVCs online via `kubectl edit pvc`. - **Volume expansion**: The StorageClass has `allowVolumeExpansion: true` — you can resize PVCs online via `kubectl edit pvc`.
@@ -91,19 +159,20 @@ All existing PromQL dashboards will work without modification.
## Useful Commands ## Useful Commands
```bash ```bash
# Check placement # Get LoadBalancer IP
curl http://localhost:7201/api/v1/services/m3db/placement | jq kubectl -n m3db get svc m3coordinator-lb
# Check namespace readiness # Check cluster health (from inside cluster)
curl http://localhost:7201/api/v1/services/m3db/namespace/ready \ kubectl -n m3db exec m3dbnode-0 -- curl -s http://m3coordinator.m3db.svc.cluster.local:7201/health
-d '{"name":"default"}'
# Write a test metric # Check placement (from inside cluster)
curl -X POST http://localhost:7201/api/v1/prom/remote/write \ kubectl -n m3db exec m3dbnode-0 -- curl -s http://m3coordinator.m3db.svc.cluster.local:7201/api/v1/services/m3db/placement | jq
-H "Content-Type: application/x-protobuf"
# Query via PromQL # Check m3dbnode bootstrapped status
curl "http://localhost:7201/api/v1/query?query=up" kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health
# Query via PromQL (external)
curl "http://<LB-IP>:7201/api/v1/query?query=up"
# Delete the init job to re-run (if needed) # Delete the init job to re-run (if needed)
kubectl -n m3db delete job m3db-cluster-init kubectl -n m3db delete job m3db-cluster-init

View File

@@ -1,4 +1,4 @@
apiVersion: kustomize.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:

241
test-metrics.py Normal file
View File

@@ -0,0 +1,241 @@
#!/usr/bin/env python3
"""
Test script for M3DB read/write functionality.
Usage: python3 test-metrics.py <LB_IP>
"""
import sys
import time
import random
import requests
def main():
    """Exercise an M3 coordinator end to end.

    Checks, in order:
      1. coordinator health  (GET /health)
      2. placement           (GET /api/v1/services/m3db/placement)
      3. namespaces          (GET /api/v1/services/m3db/namespace)
      4. PromQL query        (GET /api/v1/query?query=up)
      5. remote_write        (POST snappy-compressed protobuf payload)
      6. read-back of the metric written in step 5

    Usage: python3 test-metrics.py <LB_IP>

    Exits 1 on usage error or when the coordinator is unreachable or
    unhealthy; later check failures are printed but do not abort the run.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 test-metrics.py <LB_IP>")
        print("Example: python3 test-metrics.py 192.168.1.100")
        sys.exit(1)

    host = sys.argv[1]
    base_url = f"http://{host}:7201"

    # Unique metric name per run so repeated invocations never collide.
    ts = int(time.time())
    metric_name = f"m3db_test_metric_{ts}"
    metric_value = random.randint(1, 1000)

    print(f"=== M3DB Metrics Test ===")
    print(f"Host: {host}")
    print(f"Metric: {metric_name}")
    print(f"Value: {metric_value}")
    print()

    print("=== Writing metric ===")
    write_url = f"{base_url}/api/v1/prom/remote/write"

    # 1. Coordinator health.
    # BUGFIX: the coordinator health endpoint is /health —
    # /api/v1/services/m3db/health only responds once the m3db service is
    # bootstrapped, which made this check fail on a fresh cluster. This is
    # now consistent with the probes in the coordinator manifest and with
    # test-metrics.sh.
    health_url = f"{base_url}/health"
    try:
        resp = requests.get(health_url, timeout=10)
        if resp.status_code == 200:
            print(f"✓ Coordinator healthy")
        else:
            print(f"✗ Coordinator unhealthy: {resp.status_code}")
            sys.exit(1)
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to connect: {e}")
        sys.exit(1)

    # 2. Placement (cluster topology).
    placement_url = f"{base_url}/api/v1/services/m3db/placement"
    try:
        resp = requests.get(placement_url, timeout=10)
        if resp.status_code == 200:
            placement = resp.json()
            instances = placement.get("placement", {}).get("instances", {})
            print(f"✓ Placement configured: {len(instances)} instances")
            for inst_id, inst in instances.items():
                print(f"  - {inst_id}: {inst.get('endpoint', 'unknown')}")
        else:
            print(f"✗ Placement not ready: {resp.status_code}")
            print(f"  Response: {resp.text}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to get placement: {e}")

    # 3. Namespaces (retention tiers).
    # BUGFIX: the namespace API nests the map under "registry"
    # ({"registry": {"namespaces": {...}}}) — the same path test-metrics.sh
    # parses. The old top-level .get("namespaces") always returned {}.
    namespace_url = f"{base_url}/api/v1/services/m3db/namespace"
    try:
        resp = requests.get(namespace_url, timeout=10)
        if resp.status_code == 200:
            ns_data = resp.json()
            namespaces = ns_data.get("registry", {}).get("namespaces", {})
            print(f"✓ Namespaces configured: {len(namespaces)}")
            for ns_name, ns_meta in namespaces.items():
                print(f"  - {ns_name}")
        else:
            print(f"✗ Namespaces not ready: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed to get namespaces: {e}")

    # 4. PromQL query smoke test (an empty result set is still a success).
    print()
    print("=== Query test ===")
    query_url = f"{base_url}/api/v1/query"
    try:
        resp = requests.get(query_url, params={"query": "up"}, timeout=10)
        if resp.status_code == 200:
            result = resp.json()
            status = result.get("status")
            print(f"✓ Query returned: {status}")
            data = result.get("data", {}).get("result", [])
            print(f"  Results: {len(data)} series")
        else:
            print(f"✗ Query failed: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Query failed: {e}")

    # 5. Write via Prometheus remote_write (snappy-compressed protobuf).
    print()
    print("=== Write test ===")
    print("Writing via Prometheus remote_write format...")

    import struct
    # BUGFIX: the snappy import was unguarded, so a missing python-snappy
    # raised an unhandled ImportError instead of the advertised hint.
    try:
        import snappy  # pip install python-snappy
    except ImportError:
        snappy = None
        print("✗ python-snappy not installed — skipping write test")
        print("  Install with: pip install python-snappy")

    # Minimal hand-rolled protobuf encoding of prometheus.WriteRequest:
    #   message WriteRequest { repeated TimeSeries timeseries = 1; }
    #   message TimeSeries  { repeated Label labels = 1;
    #                         repeated Sample samples = 2; }
    #   message Label       { string name = 1; string value = 2; }
    #   message Sample      { double value = 1; int64 timestamp_ms = 2; }
    def encode_varint(n):
        """Encode a non-negative int as a protobuf varint."""
        out = []
        while n > 127:
            out.append((n & 0x7F) | 0x80)
            n >>= 7
        out.append(n)
        return bytes(out)

    def encode_bytes(field_num, data):
        """Encode a length-delimited field (wire type 2) from raw bytes."""
        tag = (field_num << 3) | 2
        return bytes([tag]) + encode_varint(len(data)) + data

    def encode_string(field_num, s):
        """Encode a UTF-8 string field (length-delimited)."""
        return encode_bytes(field_num, s.encode("utf-8"))

    def encode_double(field_num, value):
        """Encode a double field (wire type 1, little-endian 64-bit)."""
        tag = (field_num << 3) | 1
        return bytes([tag]) + struct.pack("<d", value)

    def encode_int64(field_num, value):
        """Encode an int64 field as a varint (wire type 0)."""
        tag = (field_num << 3) | 0
        return bytes([tag]) + encode_varint(value)

    if snappy is not None:
        # Sample { value = metric_value, timestamp_ms = now }.
        sample = (
            encode_double(1, float(metric_value))
            + encode_int64(2, int(time.time() * 1000))
        )
        # BUGFIX: each Label must be its own length-delimited submessage.
        # The old code concatenated both name/value pairs into a single
        # Label (and a dead `ts_data = encode_string(1, labels)` line even
        # crashed, since encode_string can't take bytes).
        label_name = encode_string(1, "__name__") + encode_string(2, metric_name)
        label_test = encode_string(1, "test") + encode_string(2, "m3db_verification")
        # TimeSeries: labels are field 1 (repeated), samples field 2.
        ts_encoded = (
            encode_bytes(1, label_name)
            + encode_bytes(1, label_test)
            + encode_bytes(2, sample)
        )
        # WriteRequest: timeseries is field 1.
        write_req = encode_bytes(1, ts_encoded)

        compressed = snappy.compress(write_req)
        headers = {
            "Content-Encoding": "snappy",
            "Content-Type": "application/x-protobuf",
            "X-Prometheus-Remote-Write-Version": "0.1.0",
        }
        try:
            resp = requests.post(write_url, data=compressed, headers=headers, timeout=10)
            if resp.status_code == 204 or resp.status_code == 200:
                print(f"✓ Write successful: {metric_name} = {metric_value}")
            else:
                print(f"✗ Write failed: {resp.status_code}")
                print(f"  Response: {resp.text}")
        except requests.exceptions.RequestException as e:
            print(f"✗ Write failed: {e}")

    # 6. Read back; indexing can lag slightly, hence the short sleep.
    time.sleep(2)
    print()
    print("=== Read back test ===")
    try:
        resp = requests.get(query_url, params={"query": metric_name}, timeout=10)
        if resp.status_code == 200:
            result = resp.json()
            data = result.get("data", {}).get("result", [])
            if data:
                print(f"✓ Metric found!")
                for series in data:
                    metric = series.get("metric", {})
                    values = series.get("values", series.get("value", []))
                    print(f"  Labels: {metric}")
                    print(f"  Values: {values}")
            else:
                print(f"✗ Metric not found (may take a moment to index)")
        else:
            print(f"✗ Query failed: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"✗ Query failed: {e}")

    print()
    print("=== Test complete ===")


if __name__ == "__main__":
    main()

75
test-metrics.sh Executable file
View File

@@ -0,0 +1,75 @@
#!/bin/bash
#
# Simple M3DB connectivity test
# Usage: ./test-metrics.sh <LB_IP>
#
set -e

lb_ip="${1:-}"
if [ -z "$lb_ip" ]; then
  echo "Usage: $0 <LB_IP>"
  echo "Example: $0 192.168.1.100"
  exit 1
fi

base_url="http://${lb_ip}:7201"

echo "=== M3DB Connectivity Test ==="
echo "Target: ${base_url}"
echo ""

# ---- 1. Coordinator health (hard failure: nothing else can work) ----------
echo "1. Coordinator Health"
if curl -sf "${base_url}/health" > /dev/null 2>&1; then
  echo " ✓ Healthy"
else
  echo " ✗ Unhealthy or unreachable"
  exit 1
fi

# ---- 2. Placement: expect one entry per m3dbnode in the cluster -----------
echo ""
echo "2. Placement (cluster topology)"
placement_json=$(curl -sf "${base_url}/api/v1/services/m3db/placement" 2>/dev/null || echo '{}')
instance_count=$(echo "$placement_json" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); print(len(d))" 2>/dev/null || echo "0")
if [ "$instance_count" -gt 0 ]; then
  echo "$instance_count instances in placement"
  echo "$placement_json" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); [print(f' - {k}') for k in d.keys()]" 2>/dev/null || true
else
  echo " ✗ No placement configured (run init job)"
fi

# ---- 3. Namespaces: the retention tiers created by the init job -----------
echo ""
echo "3. Namespaces (retention policies)"
namespace_json=$(curl -sf "${base_url}/api/v1/services/m3db/namespace" 2>/dev/null || echo '{}')
namespace_count=$(echo "$namespace_json" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); print(len(d))" 2>/dev/null || echo "0")
if [ "$namespace_count" -gt 0 ]; then
  echo "$namespace_count namespaces configured"
  echo "$namespace_json" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); [print(f' - {k}') for k in d.keys()]" 2>/dev/null || true
else
  echo " ✗ No namespaces configured (run init job)"
fi

# ---- 4. PromQL round-trip: an empty result set still counts as success ----
echo ""
echo "4. Query Test (PromQL)"
query_json=$(curl -sf "${base_url}/api/v1/query?query=up" 2>/dev/null || echo '{"status":"error"}')
query_status=$(echo "$query_json" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")
if [ "$query_status" = "success" ]; then
  series_count=$(echo "$query_json" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('data',{}).get('result',[])))" 2>/dev/null || echo "0")
  echo " ✓ Query returned: $series_count series"
else
  echo " ✗ Query failed"
fi

# ---- 5. Write test: deferred to the Python script (needs snappy+protobuf) -
echo ""
echo "5. Write Test"
echo "   Note: Prometheus remote_write requires protobuf + snappy encoding."
echo "   Use test-metrics.py for full write/read verification."
echo "   Install: pip install python-snappy requests"

echo ""
echo "=== Test Complete ==="