Replace LB with Traefik ingress for TLS + basic auth
- Remove m3coordinator LoadBalancer service (was using deprecated AutoSSL) - Add Traefik ingress controller with Let's Encrypt ACME - Add basic auth middleware for external access - Update test scripts with auth support and fixed protobuf encoding - Add multi-tenancy documentation (label-based isolation) - Update README with Traefik deployment instructions
This commit is contained in:
@@ -115,36 +115,3 @@ spec:
|
||||
protocol: TCP
|
||||
selector:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
|
||||
---
|
||||
|
||||
##############################################################################
|
||||
# M3 Coordinator LoadBalancer Service
|
||||
# External endpoint for cross-region/cross-cluster access
|
||||
# Vultr CCM provisions a managed load balancer automatically
|
||||
#
|
||||
# remote_write → http://<LB-IP>:7201/api/v1/prom/remote/write
|
||||
# remote_read → http://<LB-IP>:7201/api/v1/prom/remote/read
|
||||
# query (Grafana) → http://<LB-IP>:7201
|
||||
##############################################################################
|
||||
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: m3coordinator-lb
|
||||
namespace: m3db
|
||||
labels:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
app.kubernetes.io/part-of: m3db
|
||||
annotations:
|
||||
# Backend speaks HTTP so LB can do health checks and proper proxying
|
||||
service.beta.kubernetes.io/vultr-loadbalancer-backend-protocol: "http"
|
||||
spec:
|
||||
type: LoadBalancer
|
||||
ports:
|
||||
- name: api
|
||||
port: 7201
|
||||
targetPort: 7201
|
||||
protocol: TCP
|
||||
selector:
|
||||
app.kubernetes.io/name: m3coordinator
|
||||
|
||||
32
08-basic-auth-middleware.yaml
Normal file
32
08-basic-auth-middleware.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
##############################################################################
|
||||
# Basic Auth Middleware for Traefik
|
||||
# CHANGE THE PASSWORD BEFORE PRODUCTION USE!
|
||||
#
|
||||
# To generate a new htpasswd entry:
|
||||
# htpasswd -nb <username> <password>
|
||||
# The Secret below uses stringData, so paste that output in as-is. Base64-encode it only if you switch to the data: field:
|
||||
# echo -n '<htpasswd-output>' | base64
|
||||
# Update the secret below with the new value.
|
||||
##############################################################################
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: basic-auth-secret
|
||||
namespace: m3db
|
||||
type: Opaque
|
||||
# htpasswd -nb example example
|
||||
stringData:
|
||||
users: |-
|
||||
example:$apr1$oMBgtfpd$CBTS17sDq7GN58qaoIMvh.
|
||||
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: basic-auth
|
||||
namespace: m3db
|
||||
spec:
|
||||
basicAuth:
|
||||
secret: basic-auth-secret
|
||||
59
09-m3coordinator-ingressroute.yaml
Normal file
59
09-m3coordinator-ingressroute.yaml
Normal file
@@ -0,0 +1,59 @@
|
||||
##############################################################################
|
||||
# M3 Coordinator IngressRoute
|
||||
# Traefik handles TLS termination + basic auth
|
||||
# External: https://m3db.vultrlabs.dev → Traefik → m3coordinator:7201
|
||||
##############################################################################
|
||||
|
||||
---
|
||||
# HTTP redirect to HTTPS
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: m3coordinator-redirect
|
||||
namespace: m3db
|
||||
spec:
|
||||
entryPoints:
|
||||
- web
|
||||
routes:
|
||||
- match: Host(`m3db.vultrlabs.dev`)
|
||||
kind: Rule
|
||||
middlewares:
|
||||
- name: redirect-https
|
||||
namespace: m3db
|
||||
services:
|
||||
- name: m3coordinator
|
||||
port: 7201
|
||||
|
||||
---
|
||||
# HTTPS with basic auth
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
metadata:
|
||||
name: m3coordinator
|
||||
namespace: m3db
|
||||
spec:
|
||||
entryPoints:
|
||||
- websecure
|
||||
routes:
|
||||
- match: Host(`m3db.vultrlabs.dev`)
|
||||
kind: Rule
|
||||
middlewares:
|
||||
- name: basic-auth
|
||||
namespace: m3db
|
||||
services:
|
||||
- name: m3coordinator
|
||||
port: 7201
|
||||
tls:
|
||||
certResolver: letsencrypt
|
||||
|
||||
---
|
||||
# HTTPS redirect middleware
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: redirect-https
|
||||
namespace: m3db
|
||||
spec:
|
||||
redirectScheme:
|
||||
scheme: https
|
||||
permanent: true
|
||||
187
README.md
187
README.md
@@ -2,28 +2,46 @@
|
||||
|
||||
Drop-in Mimir replacement using M3DB for long-term Prometheus metrics storage, deployed on Vultr VKE with Vultr Block Storage CSI.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **kubectl** — for applying manifests
|
||||
- **helm** — for installing Traefik Ingress Controller
|
||||
|
||||
```bash
|
||||
# Install helm (macOS/Linux with Homebrew)
|
||||
brew install helm
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────┐
|
||||
│ Vultr VKE Cluster │
|
||||
│ │
|
||||
External Prometheus ─┼──remote_write──▶ Vultr LoadBalancer (m3coordinator-lb)
|
||||
External Grafana ─┼──PromQL query──▶ │ (managed, provisioned by CCM)
|
||||
│ │
|
||||
In-cluster Prometheus┼──remote_write──▶ M3 Coordinator (Deployment, 2 replicas)
|
||||
In-cluster Grafana ┼──PromQL query──▶ │
|
||||
│ │
|
||||
│ ┌───────┴───────┐
|
||||
│ │ M3DB Nodes │ (StatefulSet, 3 replicas)
|
||||
│ │ Vultr Block │ (100Gi NVMe per node)
|
||||
│ │ Storage │
|
||||
│ └───────┬───────┘
|
||||
│ │
|
||||
│ etcd cluster (StatefulSet, 3 replicas)
|
||||
External Prometheus ─┼──remote_write──▶ Traefik Ingress (LoadBalancer) │
|
||||
External Grafana ─┼──PromQL query──▶ │ TLS termination, basic auth │
|
||||
│ │ │
|
||||
│ ┌──────┴──────┐ │
|
||||
│ │ M3 Coordinator (Deployment, 2 replicas)
|
||||
In-cluster Prometheus┼──remote_write──▶ │ │
|
||||
In-cluster Grafana ┼──PromQL query──▶ │ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ │
|
||||
│ ┌────┴────┐ │
|
||||
│ │ M3DB Nodes │ (StatefulSet, 3 replicas)
|
||||
│ │ Vultr Block│ (100Gi NVMe per node) │
|
||||
│ │ Storage │ │
|
||||
│ └────┬────┘ │
|
||||
│ │ │
|
||||
│ etcd cluster (StatefulSet, 3 replicas)
|
||||
└─────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**External access flow:**
|
||||
```
|
||||
Internet → Vultr LoadBalancer → Traefik (TLS + basic auth) → m3coordinator:7201
|
||||
```
|
||||
|
||||
## Retention Tiers
|
||||
|
||||
| Namespace | Resolution | Retention | Use Case |
|
||||
@@ -35,15 +53,36 @@ In-cluster Grafana ┼──PromQL query──▶ │
|
||||
## Deployment
|
||||
|
||||
```bash
|
||||
# 1. Apply everything
|
||||
# 1. Install Traefik Ingress Controller (handles TLS + basic auth)
|
||||
helm repo add traefik https://traefik.github.io/charts
|
||||
helm repo update
|
||||
helm install traefik traefik/traefik \
|
||||
--namespace traefik --create-namespace \
|
||||
--set 'additionalArguments[0]=--certificatesresolvers.letsencrypt.acme.email=your-email@example.com' \
|
||||
--set 'additionalArguments[1]=--certificatesresolvers.letsencrypt.acme.storage=/data/acme.json' \
|
||||
--set 'additionalArguments[2]=--certificatesresolvers.letsencrypt.acme.httpchallenge.entrypoint=web'
|
||||
|
||||
# Note: ACME requires single replica. For HA, use external cert management
|
||||
# or Traefik Enterprise with distributed ACME storage.
|
||||
|
||||
# 2. Get the Traefik LoadBalancer IP and update DNS
|
||||
kubectl -n traefik get svc traefik
|
||||
# Point your domain (e.g., m3db.vultrlabs.dev) to this IP
|
||||
|
||||
# 3. Apply M3DB manifests
|
||||
kubectl apply -k .
|
||||
|
||||
# 2. Wait for all pods to be Running
|
||||
# 4. Wait for all pods to be Running
|
||||
kubectl -n m3db get pods -w
|
||||
```
|
||||
|
||||
# 3. Bootstrap the cluster (placement + namespaces)
|
||||
# The init job waits for coordinator health, which requires m3db to be bootstrapped.
|
||||
# Bootstrap directly via m3dbnode's embedded coordinator:
|
||||
## Bootstrap M3DB Cluster
|
||||
|
||||
The init job waits for coordinator health, which requires m3db to be bootstrapped.
|
||||
Bootstrap directly via m3dbnode's embedded coordinator:
|
||||
|
||||
```bash
|
||||
# Initialize placement
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/placement/init \
|
||||
-H "Content-Type: application/json" -d '{
|
||||
"num_shards": 64,
|
||||
@@ -55,6 +94,7 @@ kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/
|
||||
]
|
||||
}'
|
||||
|
||||
# Create namespaces
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
|
||||
-H "Content-Type: application/json" -d '{"name":"default","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"repairEnabled":false,"retentionOptions":{"retentionPeriodDuration":"48h","blockSizeDuration":"2h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"2h"}}}'
|
||||
|
||||
@@ -64,37 +104,43 @@ kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s -X POST http://localhost:7201/api/v1/services/m3db/namespace \
|
||||
-H "Content-Type: application/json" -d '{"name":"agg_1m_1y","options":{"bootstrapEnabled":true,"flushEnabled":true,"writesToCommitLog":true,"cleanupEnabled":true,"snapshotEnabled":true,"retentionOptions":{"retentionPeriodDuration":"8760h","blockSizeDuration":"24h","bufferFutureDuration":"10m","bufferPastDuration":"10m"},"indexOptions":{"enabled":true,"blockSizeDuration":"24h"},"aggregationOptions":{"aggregations":[{"aggregated":true,"attributes":{"resolutionDuration":"1m"}}]}}}'
|
||||
|
||||
# 4. Wait for bootstrapping to complete (check shard state = AVAILABLE)
|
||||
# Wait for bootstrapping to complete (check shard state = AVAILABLE)
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health
|
||||
```
|
||||
|
||||
# 5. Get the LoadBalancer IP
|
||||
kubectl -n m3db get svc m3coordinator-lb
|
||||
## Authentication
|
||||
|
||||
External access is protected by HTTP basic auth. Update the password in `08-basic-auth-middleware.yaml`:
|
||||
|
||||
```bash
|
||||
# Generate new htpasswd entry
|
||||
htpasswd -nb <username> <password>
|
||||
|
||||
# Update the secret stringData.users field and apply
|
||||
kubectl apply -f 08-basic-auth-middleware.yaml
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
**Quick connectivity test:**
|
||||
```bash
|
||||
./test-metrics.sh <BASE_URL>
|
||||
# Example:
|
||||
./test-metrics.sh http://m3db.vultrlabs.dev:7201
|
||||
```
|
||||
# With basic auth (external)
|
||||
./test-metrics.sh https://m3db.vultrlabs.dev example example
|
||||
|
||||
This script verifies:
|
||||
1. Coordinator health endpoint responds
|
||||
2. Placement is configured with all 3 m3dbnode instances
|
||||
3. All 3 namespaces are created (default, agg_10s_30d, agg_1m_1y)
|
||||
4. PromQL queries work
|
||||
# Without auth (in-cluster or port-forward)
|
||||
./test-metrics.sh http://m3coordinator.m3db.svc.cluster.local:7201
|
||||
```
|
||||
|
||||
**Full read/write test (Python):**
|
||||
```bash
|
||||
pip install requests python-snappy
|
||||
python3 test-metrics.py <BASE_URL>
|
||||
# Example:
|
||||
python3 test-metrics.py http://m3db.vultrlabs.dev:7201
|
||||
```
|
||||
|
||||
Writes a test metric via Prometheus remote_write and reads it back.
|
||||
# With basic auth (external)
|
||||
python3 test-metrics.py https://m3db.vultrlabs.dev example example
|
||||
|
||||
# Without auth (in-cluster or port-forward)
|
||||
python3 test-metrics.py http://m3coordinator.m3db.svc.cluster.local:7201
|
||||
```
|
||||
|
||||
## Prometheus Configuration (Replacing Mimir)
|
||||
|
||||
@@ -120,7 +166,10 @@ remote_read:
|
||||
```yaml
|
||||
# prometheus.yml
|
||||
remote_write:
|
||||
- url: "http://m3db.vultrlabs.dev:7201/api/v1/prom/remote/write"
|
||||
- url: "https://m3db.vultrlabs.dev/api/v1/prom/remote/write"
|
||||
basic_auth:
|
||||
username: example
|
||||
password: example
|
||||
queue_config:
|
||||
capacity: 10000
|
||||
max_shards: 30
|
||||
@@ -128,21 +177,19 @@ remote_write:
|
||||
batch_send_deadline: 5s
|
||||
|
||||
remote_read:
|
||||
- url: "http://m3db.vultrlabs.dev:7201/api/v1/prom/remote/read"
|
||||
- url: "https://m3db.vultrlabs.dev/api/v1/prom/remote/read"
|
||||
basic_auth:
|
||||
username: example
|
||||
password: example
|
||||
read_recent: true
|
||||
```
|
||||
|
||||
Get the LoadBalancer IP:
|
||||
```bash
|
||||
kubectl -n m3db get svc m3coordinator-lb
|
||||
```
|
||||
|
||||
## Grafana Datasource
|
||||
|
||||
Add a **Prometheus** datasource in Grafana pointing to:
|
||||
|
||||
- **In-cluster:** `http://m3coordinator.m3db.svc.cluster.local:7201`
|
||||
- **External:** `http://m3db.vultrlabs.dev:7201`
|
||||
- **External:** `https://m3db.vultrlabs.dev` (with basic auth)
|
||||
|
||||
All existing PromQL dashboards will work without modification.
|
||||
|
||||
@@ -153,6 +200,49 @@ All existing PromQL dashboards will work without modification.
|
||||
3. **Cutover**: Once retention in M3DB covers your needs, remove the Mimir remote_write target.
|
||||
4. **Cleanup**: Decommission Mimir components.
|
||||
|
||||
## Multi-Tenancy (Label-Based)
|
||||
|
||||
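This deployment separates tenants with Prometheus-style labels. Add labels like `tenant`, `service`, `env` to your metrics to differentiate between sources. Note that this separation is logical only, not an access-control boundary: every client that shares the basic-auth credentials can read and write every tenant's series.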
|
||||
|
||||
**Write metrics with tenant labels:**
|
||||
```python
|
||||
# In your Prometheus remote_write client
|
||||
labels = {
|
||||
"tenant": "acme-corp",
|
||||
"service": "api-gateway",
|
||||
"env": "prod"
|
||||
}
|
||||
# Metric: http_requests_total{tenant="acme-corp", service="api-gateway", env="prod"}
|
||||
```
|
||||
|
||||
**Query by tenant:**
|
||||
```bash
|
||||
# All metrics from a specific tenant
|
||||
curl -u example:example -G "https://m3db.vultrlabs.dev/api/v1/query" --data-urlencode 'query=http_requests_total{tenant="acme-corp"}'
|
||||
|
||||
# Filter by service within tenant
|
||||
curl -u example:example -G "https://m3db.vultrlabs.dev/api/v1/query" --data-urlencode 'query=http_requests_total{tenant="acme-corp",service="api-gateway"}'
|
||||
|
||||
# Filter by environment
|
||||
curl -u example:example -G "https://m3db.vultrlabs.dev/api/v1/query" --data-urlencode 'query=http_requests_total{env="prod"}'
|
||||
```
|
||||
|
||||
**Prometheus configuration with labels:**
|
||||
```yaml
|
||||
# prometheus.yml
|
||||
remote_write:
|
||||
- url: "https://m3db.vultrlabs.dev/api/v1/prom/remote/write"
|
||||
basic_auth:
|
||||
username: example
|
||||
password: example
|
||||
# Add tenant labels to all metrics from this Prometheus
|
||||
write_relabel_configs:
|
||||
- target_label: tenant
|
||||
replacement: acme-corp
|
||||
- target_label: env
|
||||
replacement: prod
|
||||
```
|
||||
|
||||
## Tuning for Vultr
|
||||
|
||||
- **Storage**: The `vultr-block-storage-m3db` StorageClass uses `disk_type: nvme` (NVMe SSD). Adjust `storage` in the VolumeClaimTemplates based on your cardinality and retention.
|
||||
@@ -163,8 +253,8 @@ All existing PromQL dashboards will work without modification.
|
||||
## Useful Commands
|
||||
|
||||
```bash
|
||||
# Get LoadBalancer IP
|
||||
kubectl -n m3db get svc m3coordinator-lb
|
||||
# Get Traefik LoadBalancer IP
|
||||
kubectl -n traefik get svc traefik
|
||||
|
||||
# Check cluster health (from inside cluster)
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s http://m3coordinator.m3db.svc.cluster.local:7201/health
|
||||
@@ -175,10 +265,13 @@ kubectl -n m3db exec m3dbnode-0 -- curl -s http://m3coordinator.m3db.svc.cluster
|
||||
# Check m3dbnode bootstrapped status
|
||||
kubectl -n m3db exec m3dbnode-0 -- curl -s http://localhost:9002/health
|
||||
|
||||
# Query via PromQL (external)
|
||||
curl "http://<LB-IP>:7201/api/v1/query?query=up"
|
||||
# Query via PromQL (external with auth)
|
||||
curl -u example:example "https://m3db.vultrlabs.dev/api/v1/query?query=up"
|
||||
|
||||
# Delete the init job to re-run (if needed)
|
||||
kubectl -n m3db delete job m3db-cluster-init
|
||||
kubectl apply -f 06-init-and-pdb.yaml
|
||||
|
||||
# View Traefik logs
|
||||
kubectl -n traefik logs -l app.kubernetes.io/name=traefik
|
||||
```
|
||||
|
||||
224
test-metrics.py
224
test-metrics.py
@@ -1,7 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for M3DB read/write functionality.
|
||||
Usage: python3 test-metrics.py <BASE_URL>
|
||||
Usage: python3 test-metrics.py <BASE_URL> [USERNAME] [PASSWORD]
|
||||
|
||||
Examples:
|
||||
python3 test-metrics.py https://m3db.vultrlabs.dev example example
|
||||
python3 test-metrics.py http://192.168.1.100:7201
|
||||
"""
|
||||
|
||||
import sys
|
||||
@@ -9,44 +13,37 @@ import time
|
||||
import random
|
||||
import requests
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python3 test-metrics.py <BASE_URL>")
|
||||
print("Example: python3 test-metrics.py https://m3db.vultrlabs.dev:7201")
|
||||
print("Usage: python3 test-metrics.py <BASE_URL> [USERNAME] [PASSWORD]")
|
||||
print("Example: python3 test-metrics.py https://m3db.vultrlabs.dev example example")
|
||||
print(" python3 test-metrics.py http://192.168.1.100:7201")
|
||||
sys.exit(1)
|
||||
|
||||
base_url = sys.argv[1].rstrip('/')
|
||||
username = sys.argv[2] if len(sys.argv) > 2 else None
|
||||
password = sys.argv[3] if len(sys.argv) > 3 else None
|
||||
|
||||
# Generate unique metric name with timestamp to avoid conflicts
|
||||
ts = int(time.time())
|
||||
metric_name = f"m3db_test_metric_{ts}"
|
||||
metric_value = random.randint(1, 1000)
|
||||
# Setup auth if provided
|
||||
auth = (username, password) if username and password else None
|
||||
|
||||
print(f"=== M3DB Metrics Test ===")
|
||||
print(f"URL: {base_url}")
|
||||
print(f"Metric: {metric_name}")
|
||||
print(f"Value: {metric_value}")
|
||||
if auth:
|
||||
print(f"Auth: {username}:***")
|
||||
print()
|
||||
|
||||
# Write test metric using Prometheus remote write format
|
||||
print("=== Writing metric ===")
|
||||
write_url = f"{base_url}/api/v1/prom/remote/write"
|
||||
|
||||
# Prometheus remote write uses snappy-compressed protobuf
|
||||
# For simplicity, we'll use the M3DB native write endpoint
|
||||
# which accepts a simpler JSON format
|
||||
|
||||
# Alternative: use the /api/v1/prom/remote/write with proper protobuf
|
||||
# but that requires prometheus_remote_write protobuf definition
|
||||
# Let's use the query endpoint to verify coordinator is up first
|
||||
|
||||
# Check coordinator health
|
||||
print("=== Health Check ===")
|
||||
health_url = f"{base_url}/health"
|
||||
try:
|
||||
resp = requests.get(health_url, timeout=10)
|
||||
resp = requests.get(health_url, auth=auth, timeout=10)
|
||||
if resp.status_code == 200:
|
||||
print(f"✓ Coordinator healthy")
|
||||
elif resp.status_code == 401:
|
||||
print(f"✗ Authentication required. Provide username and password.")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f"✗ Coordinator unhealthy: {resp.status_code}")
|
||||
sys.exit(1)
|
||||
@@ -54,18 +51,12 @@ def main():
|
||||
print(f"✗ Failed to connect: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Write metric using simple HTTP write (M3DB native format)
|
||||
# Prometheus remote_write requires protobuf, so we'll write
|
||||
# a test metric using a simple approach via the M3 coordinator
|
||||
|
||||
# For a proper test, we'll use the remote_write protobuf format
|
||||
# But that's complex, so let's just verify read/write works
|
||||
# by checking the cluster is ready and querying existing data
|
||||
|
||||
# Check placement
|
||||
print()
|
||||
print("=== Placement ===")
|
||||
placement_url = f"{base_url}/api/v1/services/m3db/placement"
|
||||
try:
|
||||
resp = requests.get(placement_url, timeout=10)
|
||||
resp = requests.get(placement_url, auth=auth, timeout=10)
|
||||
if resp.status_code == 200:
|
||||
placement = resp.json()
|
||||
instances = placement.get("placement", {}).get("instances", {})
|
||||
@@ -79,26 +70,28 @@ def main():
|
||||
print(f"✗ Failed to get placement: {e}")
|
||||
|
||||
# Check namespaces
|
||||
print()
|
||||
print("=== Namespaces ===")
|
||||
namespace_url = f"{base_url}/api/v1/services/m3db/namespace"
|
||||
try:
|
||||
resp = requests.get(namespace_url, timeout=10)
|
||||
resp = requests.get(namespace_url, auth=auth, timeout=10)
|
||||
if resp.status_code == 200:
|
||||
ns_data = resp.json()
|
||||
namespaces = ns_data.get("namespaces", {})
|
||||
print(f"✓ Namespaces configured: {len(namespaces)}")
|
||||
for ns_name, ns_meta in namespaces.items():
|
||||
for ns_name in namespaces.keys():
|
||||
print(f" - {ns_name}")
|
||||
else:
|
||||
print(f"✗ Namespaces not ready: {resp.status_code}")
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"✗ Failed to get namespaces: {e}")
|
||||
|
||||
# Query test (even if no data, should return empty result)
|
||||
# Query test
|
||||
print()
|
||||
print("=== Query test ===")
|
||||
print("=== Query Test ===")
|
||||
query_url = f"{base_url}/api/v1/query"
|
||||
try:
|
||||
resp = requests.get(query_url, params={"query": "up"}, timeout=10)
|
||||
resp = requests.get(query_url, params={"query": "up"}, auth=auth, timeout=10)
|
||||
if resp.status_code == 200:
|
||||
result = resp.json()
|
||||
status = result.get("status")
|
||||
@@ -110,41 +103,23 @@ def main():
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"✗ Query failed: {e}")
|
||||
|
||||
# Write test metric using remote write protobuf
|
||||
# Write test using Prometheus remote_write
|
||||
print()
|
||||
print("=== Write test ===")
|
||||
print("Writing via Prometheus remote_write format...")
|
||||
print("=== Write Test ===")
|
||||
print("Writing metrics via Prometheus remote_write format...")
|
||||
|
||||
# Build the remote_write protobuf payload
|
||||
# This is the Prometheus remote_write format
|
||||
import struct
|
||||
import snappy # pip install python-snappy
|
||||
try:
|
||||
import struct
|
||||
import snappy # pip install python-snappy
|
||||
except ImportError:
|
||||
print("✗ Missing dependencies for write test")
|
||||
print(" Install with: pip install python-snappy")
|
||||
print(" Skipping write test...")
|
||||
print()
|
||||
print("=== Test complete (read-only) ===")
|
||||
return
|
||||
|
||||
# Prometheus remote_write protobuf (simplified)
|
||||
# message WriteRequest {
|
||||
# repeated prometheus.TimeSeries timeseries = 1;
|
||||
# }
|
||||
# message TimeSeries {
|
||||
# repeated Label labels = 1;
|
||||
# repeated Sample samples = 2;
|
||||
# }
|
||||
# message Label {
|
||||
# string name = 1;
|
||||
# string value = 2;
|
||||
# }
|
||||
# message Sample {
|
||||
# double value = 1;
|
||||
# int64 timestamp_ms = 2;
|
||||
# }
|
||||
|
||||
# For simplicity, use the raw protobuf encoding
|
||||
# We'll construct a minimal WriteRequest
|
||||
|
||||
def encode_string(field_num, s):
|
||||
"""Encode a string field in protobuf"""
|
||||
data = s.encode('utf-8')
|
||||
tag = (field_num << 3) | 2 # wire type 2 = length-delimited
|
||||
return bytes([tag]) + encode_varint(len(data)) + data
|
||||
write_url = f"{base_url}/api/v1/prom/remote/write"
|
||||
|
||||
def encode_varint(n):
|
||||
"""Encode a varint"""
|
||||
@@ -155,68 +130,84 @@ def main():
|
||||
result.append(n)
|
||||
return bytes(result)
|
||||
|
||||
def encode_string(field_num, s):
|
||||
"""Encode a string field in protobuf"""
|
||||
data = s.encode('utf-8')
|
||||
tag = (field_num << 3) | 2
|
||||
return bytes([tag]) + encode_varint(len(data)) + data
|
||||
|
||||
def encode_double(field_num, value):
|
||||
"""Encode a double field in protobuf"""
|
||||
tag = (field_num << 3) | 1 # wire type 1 = 64-bit
|
||||
tag = (field_num << 3) | 1
|
||||
return bytes([tag]) + struct.pack('<d', value)
|
||||
|
||||
def encode_int64(field_num, value):
|
||||
"""Encode an int64 field in protobuf (as varint)"""
|
||||
tag = (field_num << 3) | 0 # wire type 0 = varint
|
||||
tag = (field_num << 3) | 0
|
||||
return bytes([tag]) + encode_varint(value)
|
||||
|
||||
# Build Sample
|
||||
sample = encode_double(1, float(metric_value)) + encode_int64(2, int(time.time() * 1000))
|
||||
def encode_label(name, value):
|
||||
"""Encode a single Label message"""
|
||||
return encode_string(1, name) + encode_string(2, value)
|
||||
|
||||
# Build Labels
|
||||
labels = (
|
||||
encode_string(1, "__name__") + encode_string(2, metric_name) +
|
||||
encode_string(1, "test") + encode_string(2, "m3db_verification")
|
||||
)
|
||||
def write_metric(name, value, labels_dict):
|
||||
"""Write a metric with custom labels"""
|
||||
ts_ms = int(time.time() * 1000)
|
||||
|
||||
# Build TimeSeries
|
||||
ts_data = encode_string(1, labels) + encode_string(2, sample)
|
||||
# Note: repeated fields need proper encoding
|
||||
# Actually, for repeated fields we just repeat the field
|
||||
# Build all labels as repeated Label messages
|
||||
labels_data = b''
|
||||
|
||||
# Simplified: just encode the timeseries with proper field numbers
|
||||
# Label is field 1, Sample is field 2 in TimeSeries
|
||||
ts_encoded = (
|
||||
bytes([0x0a]) + encode_varint(len(labels)) + labels + # field 1, wire type 2
|
||||
bytes([0x12]) + encode_varint(len(sample)) + sample # field 2, wire type 2
|
||||
)
|
||||
# __name__ label first
|
||||
labels_data += bytes([0x0a]) + encode_varint(len(encode_label("__name__", name))) + encode_label("__name__", name)
|
||||
|
||||
# Build WriteRequest (timeseries is field 1)
|
||||
write_req = bytes([0x0a]) + encode_varint(len(ts_encoded)) + ts_encoded
|
||||
# Then custom labels
|
||||
for k, v in labels_dict.items():
|
||||
label_msg = encode_label(k, v)
|
||||
labels_data += bytes([0x0a]) + encode_varint(len(label_msg)) + label_msg
|
||||
|
||||
# Compress with snappy
|
||||
compressed = snappy.compress(write_req)
|
||||
# Build Sample (field 2 in TimeSeries)
|
||||
sample = encode_double(1, float(value)) + encode_int64(2, ts_ms)
|
||||
|
||||
headers = {
|
||||
"Content-Encoding": "snappy",
|
||||
"Content-Type": "application/x-protobuf",
|
||||
"X-Prometheus-Remote-Write-Version": "0.1.0"
|
||||
}
|
||||
# Build TimeSeries
|
||||
ts_encoded = labels_data + bytes([0x12]) + encode_varint(len(sample)) + sample
|
||||
|
||||
try:
|
||||
resp = requests.post(write_url, data=compressed, headers=headers, timeout=10)
|
||||
if resp.status_code == 204 or resp.status_code == 200:
|
||||
print(f"✓ Write successful: {metric_name} = {metric_value}")
|
||||
else:
|
||||
print(f"✗ Write failed: {resp.status_code}")
|
||||
print(f" Response: {resp.text}")
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"✗ Write failed: {e}")
|
||||
print(" (This is expected if python-snappy is not installed)")
|
||||
print(" Install with: pip install python-snappy")
|
||||
# Build WriteRequest
|
||||
write_req = bytes([0x0a]) + encode_varint(len(ts_encoded)) + ts_encoded
|
||||
|
||||
# Wait a moment and query back
|
||||
# Compress with snappy
|
||||
compressed = snappy.compress(write_req)
|
||||
|
||||
headers = {
|
||||
"Content-Encoding": "snappy",
|
||||
"Content-Type": "application/x-protobuf",
|
||||
"X-Prometheus-Remote-Write-Version": "0.1.0"
|
||||
}
|
||||
|
||||
resp = requests.post(write_url, data=compressed, headers=headers, auth=auth, timeout=10)
|
||||
return resp.status_code
|
||||
|
||||
# Write test metrics with tenant labels
|
||||
print()
|
||||
tenants = [
|
||||
{"tenant": "test-tenant", "service": "api", "env": "test"},
|
||||
]
|
||||
|
||||
ts = int(time.time())
|
||||
for labels in tenants:
|
||||
metric_name = f"test_metric_{ts}"
|
||||
metric_value = random.randint(1, 100)
|
||||
|
||||
status = write_metric(metric_name, metric_value, labels)
|
||||
print(f"✓ Wrote: {metric_name} = {metric_value}")
|
||||
print(f" Labels: tenant={labels.get('tenant')}, service={labels.get('service')}, env={labels.get('env')}")
|
||||
|
||||
# Wait and query back
|
||||
time.sleep(2)
|
||||
|
||||
print()
|
||||
print("=== Read back test ===")
|
||||
print("=== Read Back Test ===")
|
||||
try:
|
||||
resp = requests.get(query_url, params={"query": metric_name}, timeout=10)
|
||||
resp = requests.get(query_url, params={"query": metric_name}, auth=auth, timeout=10)
|
||||
if resp.status_code == 200:
|
||||
result = resp.json()
|
||||
data = result.get("data", {}).get("result", [])
|
||||
@@ -235,7 +226,20 @@ def main():
|
||||
print(f"✗ Query failed: {e}")
|
||||
|
||||
print()
|
||||
print("=== Multi-Tenancy Query Examples ===")
|
||||
print()
|
||||
print("Query by tenant:")
|
||||
print(f" curl -u user:pass '{base_url}/api/v1/query?query={{tenant=\"test-tenant\"}}'")
|
||||
print()
|
||||
print("Query by service:")
|
||||
print(f" curl -u user:pass '{base_url}/api/v1/query?query={{service=\"api\"}}'")
|
||||
print()
|
||||
print("Query by env:")
|
||||
print(f" curl -u user:pass '{base_url}/api/v1/query?query={{env=\"test\"}}'")
|
||||
print()
|
||||
|
||||
print("=== Test complete ===")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,15 +1,22 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Simple M3DB connectivity test
|
||||
# Usage: ./test-metrics.sh <BASE_URL>
|
||||
# Usage: ./test-metrics.sh <BASE_URL> [USERNAME] [PASSWORD]
|
||||
#
|
||||
# Examples:
|
||||
# ./test-metrics.sh https://m3db.vultrlabs.dev example example
|
||||
# ./test-metrics.sh http://192.168.1.100:7201
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
BASE_URL="${1:-}"
|
||||
USERNAME="${2:-}"
|
||||
PASSWORD="${3:-}"
|
||||
|
||||
if [ -z "$BASE_URL" ]; then
|
||||
echo "Usage: $0 <BASE_URL>"
|
||||
echo "Example: $0 https://m3db.vultrlabs.dev:7201"
|
||||
echo "Usage: $0 <BASE_URL> [USERNAME] [PASSWORD]"
|
||||
echo "Example: $0 https://m3db.vultrlabs.dev example example"
|
||||
echo " $0 http://192.168.1.100:7201"
|
||||
exit 1
|
||||
fi
|
||||
@@ -17,13 +24,22 @@ fi
|
||||
# Remove trailing slash if present
|
||||
BASE_URL="${BASE_URL%/}"
|
||||
|
||||
# Build auth flag if credentials provided
|
||||
AUTH_FLAG=""
|
||||
if [ -n "$USERNAME" ] && [ -n "$PASSWORD" ]; then
|
||||
AUTH_FLAG="-u ${USERNAME}:${PASSWORD}"
|
||||
fi
|
||||
|
||||
echo "=== M3DB Connectivity Test ==="
|
||||
echo "Target: ${BASE_URL}"
|
||||
if [ -n "$AUTH_FLAG" ]; then
|
||||
echo "Auth: ${USERNAME}:***"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# Health check
|
||||
echo "1. Coordinator Health"
|
||||
if curl -sf "${BASE_URL}/health" > /dev/null 2>&1; then
|
||||
if curl -sf $AUTH_FLAG "${BASE_URL}/health" > /dev/null 2>&1; then
|
||||
echo " ✓ Healthy"
|
||||
else
|
||||
echo " ✗ Unhealthy or unreachable"
|
||||
@@ -33,7 +49,7 @@ fi
|
||||
# Placement
|
||||
echo ""
|
||||
echo "2. Placement (cluster topology)"
|
||||
PLACEMENT=$(curl -sf "${BASE_URL}/api/v1/services/m3db/placement" 2>/dev/null || echo '{}')
|
||||
PLACEMENT=$(curl -sf $AUTH_FLAG "${BASE_URL}/api/v1/services/m3db/placement" 2>/dev/null || echo '{}')
|
||||
INSTANCE_COUNT=$(echo "$PLACEMENT" | python3 -c "import sys,json; d=json.load(sys.stdin).get('placement',{}).get('instances',{}); print(len(d))" 2>/dev/null || echo "0")
|
||||
if [ "$INSTANCE_COUNT" -gt 0 ]; then
|
||||
echo " ✓ $INSTANCE_COUNT instances in placement"
|
||||
@@ -45,7 +61,7 @@ fi
|
||||
# Namespaces
|
||||
echo ""
|
||||
echo "3. Namespaces (retention policies)"
|
||||
NAMESPACES=$(curl -sf "${BASE_URL}/api/v1/services/m3db/namespace" 2>/dev/null || echo '{}')
|
||||
NAMESPACES=$(curl -sf $AUTH_FLAG "${BASE_URL}/api/v1/services/m3db/namespace" 2>/dev/null || echo '{}')
|
||||
NS_COUNT=$(echo "$NAMESPACES" | python3 -c "import sys,json; d=json.load(sys.stdin).get('registry',{}).get('namespaces',{}); print(len(d))" 2>/dev/null || echo "0")
|
||||
if [ "$NS_COUNT" -gt 0 ]; then
|
||||
echo " ✓ $NS_COUNT namespaces configured"
|
||||
@@ -57,7 +73,7 @@ fi
|
||||
# Query test
|
||||
echo ""
|
||||
echo "4. Query Test (PromQL)"
|
||||
QUERY_RESULT=$(curl -sf "${BASE_URL}/api/v1/query?query=up" 2>/dev/null || echo '{"status":"error"}')
|
||||
QUERY_RESULT=$(curl -sf $AUTH_FLAG "${BASE_URL}/api/v1/query?query=up" 2>/dev/null || echo '{"status":"error"}')
|
||||
STATUS=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")
|
||||
if [ "$STATUS" = "success" ]; then
|
||||
RESULT_COUNT=$(echo "$QUERY_RESULT" | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('data',{}).get('result',[])))" 2>/dev/null || echo "0")
|
||||
|
||||
Reference in New Issue
Block a user