Files
nvfp4-megamoe-kernel/tests/production_values_test.py
biondizzle f52eedbdce Add production-value tests: ALL tests use Pro config (61L, HD=512, 384 experts, HCA=128, 1M context)
Previous unit tests used toy values (HD=64-256, T=16, small N).
These tests validate the actual production configuration:
- FMHA: HD=512, 128 Q heads, N=128/2048/8192
- Compression: CSA T=4096, HCA T=16384, full 1M context
- NVFP4: production weight shapes (q_a, kv, wo_a, gate)
- MoE: 384 experts, top-6, 3072 intermediate
- mHC: 4 streams, 61 layers, residual bounded, doubly-stochastic
- Router: 384 experts hash + noaux-TC
- Memory budget: 1M context KV pool, 8-GPU weight distribution
2026-06-02 04:10:39 +00:00

476 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Production-value tests for DSV4 Pro kernel stack.
ALL tests use Pro config values:
- 61 layers, 7168 hidden, 128 query heads, HD=512
- 384 routed experts, top-6, 3072 intermediate
- HCA ratio=128, CSA ratio=4, CSA top-k=1024
- 4-way mHC, 20 Sinkhorn iters
- SWA window=128
This file is the ONLY acceptable place for non-production test values.
If a test needs a smaller value for memory/time, it must be marked
with a comment explaining why and what the production value should be.
"""
import math
import torch
import pytest
# ─── Production Pro config ───────────────────────────────────────────
# Single source of truth for the Pro configuration values used by every test
# in this module. Key order is kept stable.
PRO = {
    # Depth / width: 61 layers, 7168 hidden.
    "num_layers": 61,
    "hidden_size": 7168,
    # Attention: 128 query heads, HD=512.
    "num_query_heads": 128,
    "head_dim": 512,
    "rope_dim": 64,
    "query_compression_dim": 1536,
    # CSA: ratio=4, top-k=1024.
    "csa_compression_ratio": 4,
    "csa_top_k": 1024,
    "indexer_num_heads": 64,
    "indexer_head_dim": 128,
    # HCA ratio=128 and SWA window=128.
    "hca_compression_ratio": 128,
    "sliding_window": 128,
    "num_output_groups": 16,
    "output_group_dim": 1024,
    # MoE: 384 routed experts, top-6, 3072 intermediate.
    "num_routed_experts": 384,
    "num_shared_experts": 1,
    "num_experts_per_tok": 6,
    "moe_intermediate_size": 3072,
    "num_hash_routing_layers": 3,
    "routed_scaling_factor": 2.5,
    # mHC: 4-way, 20 Sinkhorn iterations.
    "n_hc": 4,
    "sinkhorn_iters": 20,
    "rms_norm_eps": 1e-6,
}

# Tensors are created on the first CUDA device when one is visible, else on CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda:0"
# ─── 1. FMHA at HD=512, production head counts ──────────────────────
class TestFMHAProduction:
    """FMHA tests at Pro config: HD=512, 128 query heads, various KV lengths."""

    @staticmethod
    def _sdpa_reference(q, k, v, scale):
        """Return the PyTorch SDPA reference for one KV head shared by all query heads.

        Args:
            q: Queries laid out as (T, n_q, hd).
            k: Keys laid out as (N, hd).
            v: Values laid out as (N, hd).
            scale: Softmax scale applied to the QK^T logits.

        Returns:
            BF16 tensor of shape (1, n_q, T, hd), computed in float32.
        """
        n_q = q.shape[1]
        q_4d = q.float().transpose(0, 1).unsqueeze(0)  # (1, n_q, T, hd)
        # K and V keep their native (N, hd) layout and are broadcast over the
        # query heads. Reshaping V to (hd, N) and transposing it back does NOT
        # recover V: reshape reinterprets memory, so the reference would attend
        # over scrambled values while the kernel sees the real ones.
        k_4d = k.float()[None, None].expand(1, n_q, -1, -1)  # (1, n_q, N, hd)
        v_4d = v.float()[None, None].expand(1, n_q, -1, -1)  # (1, n_q, N, hd)
        ref = torch.nn.functional.scaled_dot_product_attention(
            q_4d, k_4d, v_4d, scale=scale
        )
        return ref.bfloat16()

    def _check_decode(self, N, mode, label):
        """Compare the production FMHA against the SDPA reference for one decode step.

        Args:
            N: Number of KV tokens attended to.
            mode: String passed for both trailing arguments of
                ``_run_production_fmha`` (their meaning is defined by that
                function).
            label: Text identifying the scenario in the assertion message.
        """
        n_q = PRO["num_query_heads"]
        hd = PRO["head_dim"]
        T = 1  # decode: one query token, so flattened outputs compare 1:1
        scale = 1.0 / math.sqrt(hd)
        q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE)
        k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)
        v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)

        ref = self._sdpa_reference(q, k, v, scale)  # (1, n_q, T, hd)

        # Imported lazily so the module can be collected without dsv4 installed.
        from dsv4.layers.attention import _run_production_fmha
        prod = _run_production_fmha(
            q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, mode, mode
        )

        cos = torch.nn.functional.cosine_similarity(
            ref.flatten().float(), prod.flatten().float(), dim=0
        ).item()
        assert cos > 0.999, f"FMHA HD=512 {label}: cos={cos:.6f}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_fmha_hd512_decode_short(self):
        """Decode (T=1) with 128 Q heads, HD=512, N=128 (1 SWA window)."""
        self._check_decode(PRO["sliding_window"], "swa", "decode short")

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_fmha_hd512_decode_medium(self):
        """Decode (T=1) with HD=512, N=2048 (compressed tokens after HCA)."""
        # 2048: typical compressed KV length after HCA at moderate context
        self._check_decode(2048, "hca", "decode medium")

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_fmha_hd512_decode_long(self):
        """Decode (T=1) with HD=512, N=8192 (compressed tokens at long context)."""
        # 8192: compressed KV after HCA at ~1M context (1M/128=7812)
        self._check_decode(8192, "hca", "decode long")

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    @pytest.mark.parametrize("N", [512, 1024, 4096])
    def test_fmha_hd512_csa_topk(self, N):
        """Decode with CSA-selected tokens at HD=512 (production top-k is 1024)."""
        self._check_decode(N, "csa", f"CSA N={N}")
# ─── 2. Compression at production scale ─────────────────────────────
class TestCompressionProduction:
    """CSA and HCA compression at production token counts and ratios."""

    @staticmethod
    def _blockwise_softmax_pool(kv, gate, m):
        """Reference compression: softmax-weighted sum over each block of ``m`` tokens.

        Args:
            kv: (T, C) values to pool; T must be a multiple of ``m``.
            gate: (T, C) logits; softmax is taken over the ``m`` tokens of each
                block independently for every channel.
            m: Compression ratio (tokens per block).

        Returns:
            (T // m, C) tensor holding one pooled row per block.
        """
        n_blocks = kv.shape[0] // m
        width = kv.shape[1]
        # One batched softmax replaces a Python loop over every block.
        weights = torch.softmax(gate.reshape(n_blocks, m, width), dim=1)
        return (weights * kv.reshape(n_blocks, m, width)).sum(dim=1)

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_csa_compress_production_scale(self):
        """CSA: ratio=4, T=4096 tokens → 1024 compressed, HD=512."""
        hd = PRO["head_dim"]
        m = PRO["csa_compression_ratio"]  # 4
        T = PRO["csa_top_k"] * m  # 4096
        kv = torch.randn(T, 2 * hd, dtype=torch.float32, device=DEVICE) * 3.0
        gate = torch.randn(T, 2 * hd, dtype=torch.float32, device=DEVICE)

        # Reference: block-wise softmax + weighted sum. The softmax is
        # per-channel, so pooling the full 2*hd width equals pooling the two
        # hd-wide halves separately and concatenating them.
        ref = self._blockwise_softmax_pool(kv, gate, m)

        from dsv4.kernels.compressor.production_compress import csa_compress_production
        prod = csa_compress_production(kv.bfloat16(), gate.bfloat16(), None, None, m=m)

        cos = torch.nn.functional.cosine_similarity(
            ref.flatten().float(), prod.flatten().float(), dim=0
        ).item()
        assert cos > 0.999, f"CSA compress production scale: cos={cos:.6f}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_hca_compress_production_scale(self):
        """HCA: ratio=128, T=16384 tokens → 128 compressed, HD=512.

        This is the 1M context enabler: 1M tokens / 128 = 7812 compressed tokens.
        This test covers 128 full blocks.
        """
        hd = PRO["head_dim"]
        m = PRO["hca_compression_ratio"]  # 128
        T = m * 128  # 16384 tokens → 128 compressed
        kv = torch.randn(T, hd, dtype=torch.float32, device=DEVICE) * 3.0
        gate = torch.randn(T, hd, dtype=torch.float32, device=DEVICE)

        ref = self._blockwise_softmax_pool(kv, gate, m)

        from dsv4.kernels.compressor.production_compress import hca_compress_production
        prod = hca_compress_production(kv.bfloat16(), gate.bfloat16(), None, None, m=m)

        cos = torch.nn.functional.cosine_similarity(
            ref.flatten().float(), prod.flatten().float(), dim=0
        ).item()
        assert cos > 0.999, f"HCA compress production scale: cos={cos:.6f}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_hca_compress_1m_context(self):
        """HCA at full 1M context scale: 1M tokens, ratio=128 → 7812 compressed.

        Checks output shape and finiteness only; no numerical reference is
        computed at this size.
        """
        hd = PRO["head_dim"]
        m = PRO["hca_compression_ratio"]  # 128
        T = 1_000_000  # 1M context
        n_blocks = T // m  # 7812
        # The inputs are allocated at the full production size: each tensor is
        # T x hd BF16 values (~1 GB at HD=512), so this test needs a few GB of
        # free device memory.
        # NOTE(review): T is not a multiple of m (1_000_000 = 7812 * 128 + 64),
        # so this test expects the trailing 64 tokens not to produce an output
        # row — confirm that is the kernel's intended tail handling.
        kv = torch.randn(T, hd, dtype=torch.bfloat16, device=DEVICE) * 3.0
        gate = torch.randn(T, hd, dtype=torch.bfloat16, device=DEVICE)

        from dsv4.kernels.compressor.production_compress import hca_compress_production
        prod = hca_compress_production(kv, gate, None, None, m=m)

        assert prod.shape[0] == n_blocks, f"Expected {n_blocks} compressed, got {prod.shape[0]}"
        assert prod.shape[1] == hd, f"Expected hd={hd}, got {prod.shape[1]}"
        assert torch.isfinite(prod).all(), "HCA compress 1M: NaN/Inf in output"
# ─── 3. NVFP4 GEMM at production weight shapes ─────────────────────
class TestNVFP4GEMMProduction:
    """Test NVFP4 linear layers at Pro model weight shapes."""

    # Shapes are derived from PRO (rather than repeated as literals) so they
    # cannot drift from the production config. Values are unchanged:
    # 7168→1536, 7168→2*512, 16*1024→7168, 7168→3072*384.
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    @pytest.mark.parametrize("name,in_dim,out_dim", [
        # hidden → query compression
        ("q_a_proj", PRO["hidden_size"], PRO["query_compression_dim"]),
        # hidden → KV (1 KV head for GQA)
        ("kv_proj", PRO["hidden_size"], 2 * PRO["head_dim"]),
        # output groups → hidden
        ("wo_a_proj", PRO["num_output_groups"] * PRO["output_group_dim"], PRO["hidden_size"]),
        # hidden → moe_intermediate_size * num_routed_experts.
        # NOTE(review): the original comment called this the dense router gate
        # ("hidden → 384 experts"), which would be out_dim=384 — confirm which
        # shape is intended.
        ("gate_proj", PRO["hidden_size"], PRO["moe_intermediate_size"] * PRO["num_routed_experts"]),
    ])
    def test_nvfp4_linear_production_shapes(self, name, in_dim, out_dim):
        """Test Nvfp4Linear at actual Pro model weight dimensions."""
        from dsv4.layers.linear import Nvfp4Linear

        # kv_proj in GQA has fewer heads — the actual out_dim varies per layer
        # but the kernel must handle all shapes
        lin = Nvfp4Linear(in_dim, out_dim, max_num_tokens=8192, device=DEVICE)
        x = torch.randn(1, in_dim, dtype=torch.bfloat16, device=DEVICE) * 2.0
        out = lin(x)
        assert out.shape == (1, out_dim), f"Expected (1, {out_dim}), got {out.shape}"
        assert torch.isfinite(out).all(), f"NaN/Inf in {name} output"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_nvfp4_moe_384_experts(self):
        """Test Nvfp4MoE with 384 routed experts, top-6, 3072 intermediate."""
        from dsv4.layers.ffn import Nvfp4MoE

        H = PRO["hidden_size"]
        E = PRO["num_routed_experts"]
        K = PRO["num_experts_per_tok"]
        I = PRO["moe_intermediate_size"]
        moe = Nvfp4MoE(num_experts=E, hidden_size=H, intermediate_size=I, top_k=K, device=DEVICE)
        # One token routed to K randomly chosen experts with softmax-normalised weights.
        x = torch.randn(1, H, dtype=torch.bfloat16, device=DEVICE) * 2.0
        topk_ids = torch.randint(0, E, (1, K), device=DEVICE, dtype=torch.int32)
        topk_weights = torch.softmax(torch.randn(1, K, device=DEVICE), dim=-1)
        out = moe.run(x, topk_ids, topk_weights)
        assert out.shape == (1, H), f"Expected (1, {H}), got {out.shape}"
        assert torch.isfinite(out).all(), "NaN/Inf in MoE output"
# ─── 4. mHC at production depth ─────────────────────────────────────
class TestMHCProduction:
    """Test multi-head hyper-connection with 4 streams, 61 layers, Sinkhorn."""

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_mhc_61_layers_residual_bounded(self):
        """Run mHC through 61 layers and verify residual stays bounded.

        Production mHC should keep |X| bounded. If it grows unbounded,
        the Sinkhorn normalization is wrong.
        """
        from dsv4.layers.mhc import mHCLayer

        H = PRO["hidden_size"]
        n_hc = PRO["n_hc"]
        n_layers = PRO["num_layers"]

        # Simulate 61 layers of mHC, each a freshly constructed mHCLayer.
        x = torch.randn(n_hc, H, dtype=torch.bfloat16, device=DEVICE) * 0.5
        residual_norms = [x.abs().max().item()]  # max |x| before any layer
        for _ in range(n_layers):
            layer = mHCLayer(H, n_hc, device=DEVICE)
            # Fake sub-layer output
            sub_out = torch.randn(H, dtype=torch.bfloat16, device=DEVICE) * 0.5
            x = layer(sub_out, x)
            residual_norms.append(x.abs().max().item())

        final_norm = residual_norms[-1]
        max_norm = max(residual_norms)
        print(f"Residual norms: L0={residual_norms[0]:.1f} ... L61={final_norm:.1f} max={max_norm:.1f}")

        # The peak max-|x| over all layers must stay below 100x the input's
        # max-|x|; the 1e-6 guards against division by zero.
        growth = max_norm / (residual_norms[0] + 1e-6)
        assert growth < 100, f"mHC residual grew {growth:.1f}x over 61 layers — Sinkhorn broken?"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_mhc_sinkhorn_doubly_stochastic(self):
        """Verify Sinkhorn produces doubly-stochastic matrices at production scale.

        The iteration is implemented inline here; this test does not call
        into the dsv4 package.
        """
        n_hc = PRO["n_hc"]
        iters = PRO["sinkhorn_iters"]
        B = 16  # Production batch dimension
        comb = torch.randn(B, n_hc, n_hc, dtype=torch.bfloat16, device=DEVICE) * 2.0

        # Sinkhorn: softmax → alternate row/col norm (computed in float32)
        P = torch.softmax(comb.float(), dim=-1) + 1e-6
        for _ in range(iters):
            P = P / P.sum(dim=-1, keepdim=True)  # row norm
            P = P / P.sum(dim=-2, keepdim=True)  # col norm

        row_sums = P.sum(dim=-1)
        col_sums = P.sum(dim=-2)
        assert torch.allclose(row_sums, torch.ones_like(row_sums), atol=1e-2), \
            f"Row sums not ~1.0: {row_sums.mean().item():.4f}"
        assert torch.allclose(col_sums, torch.ones_like(col_sums), atol=1e-2), \
            f"Col sums not ~1.0: {col_sums.mean().item():.4f}"
# ─── 5. Router at production scale ──────────────────────────────────
class TestRouterProduction:
    """Router checks at 384 experts: hash routing for L0-2, noaux_tc for L3+."""

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_hash_router_384_experts(self):
        """Hash routing (layers 0-2) must return top-6 IDs inside [0, 384)."""
        from dsv4.layers.router import HashRouter

        n_experts = PRO["num_routed_experts"]
        top_k = PRO["num_experts_per_tok"]
        hidden = PRO["hidden_size"]

        router = HashRouter(num_experts=n_experts, top_k=top_k, hidden_size=hidden, device=DEVICE)
        token_ids = torch.tensor(
            [1, 50, 100, 500, 9999, 50000], dtype=torch.int32, device=DEVICE
        )
        n_tokens = len(token_ids)
        x = torch.randn(n_tokens, hidden, dtype=torch.bfloat16, device=DEVICE) * 2.0

        # Only the expert IDs are checked; the routing weights are not inspected.
        topk_ids, _ = router(x, token_ids)

        assert topk_ids.shape == (n_tokens, top_k)
        ids_in_range = (topk_ids >= 0).all() and (topk_ids < n_experts).all()
        assert ids_in_range, \
            f"Expert IDs out of range: min={topk_ids.min()}, max={topk_ids.max()}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_noaux_tc_router_384_experts(self):
        """Noaux-TC routing (layers 3+) must return top-6 IDs inside [0, 384)."""
        from dsv4.layers.router import Router

        n_experts = PRO["num_routed_experts"]
        top_k = PRO["num_experts_per_tok"]
        hidden = PRO["hidden_size"]

        router = Router(
            hidden_size=hidden, num_experts=n_experts, top_k=top_k, device=DEVICE, is_hash=False
        )
        x = torch.randn(1, hidden, dtype=torch.bfloat16, device=DEVICE) * 2.0

        # Only the expert IDs are checked; the routing weights are not inspected.
        topk_ids, _ = router.run(x)

        assert topk_ids.shape == (1, top_k)
        ids_in_range = (topk_ids >= 0).all() and (topk_ids < n_experts).all()
        assert ids_in_range, \
            f"Expert IDs out of range: min={topk_ids.min()}, max={topk_ids.max()}"
# ─── 6. Memory budget at production scale ───────────────────────────
class TestMemoryBudget:
    """Verify memory usage stays within bounds for 1M context.

    Both tests are pure arithmetic on PRO and allocate no tensors, so they
    are not gated on GPU availability.
    """

    # NVFP4 stores each weight as a 4-bit value plus one 1-byte FP8 scale per
    # 16-element block: 0.5 + 1/16 bytes per parameter.
    NVFP4_BYTES_PER_PARAM = 0.5 + 1.0 / 16
    # HBM capacity used for the per-GPU budget (B200 192GB).
    B200_HBM_GB = 192
    # Share of HBM that weights may occupy, leaving the rest for KV pool and
    # activations.
    # NOTE(review): this fraction is a test-budget choice. The original bound
    # was 100 GB, sized for a 2-bit estimate — confirm the desired headroom.
    WEIGHT_HBM_FRACTION = 0.75
    # PRO does not define "vocab_size"; this fallback is an assumption
    # (DeepSeek-V3 tokenizer size) — TODO confirm for the Pro model. The
    # embedding term is well under 1% of the total, so the estimate is
    # insensitive to it.
    FALLBACK_VOCAB_SIZE = 129_280

    def test_kv_pool_memory_1m_context(self):
        """Calculate and validate KV pool memory at 1M context.

        At 1M tokens with HCA ratio=128:
        - HCA compressed: 1M / 128 = 7812 tokens × HD=512 × 2 (K+V) × 2 bytes
        - SWA window: 128 tokens × HD=512 × 2 × 2 bytes
        - CSA top-k: 1024 tokens × HD=512 × 2 × 2 bytes
        Total per layer per batch ≈ (7812 + 128 + 1024) × 512 × 2 × 2 ≈ 18.4 MB
        × 61 layers = 1.1 GB per batch — feasible on B200 192GB
        """
        context_len = 1_000_000
        hca_compressed = context_len // PRO["hca_compression_ratio"]  # 7812
        swa_tokens = PRO["sliding_window"]  # 128
        csa_tokens = PRO["csa_top_k"]  # 1024
        hd = PRO["head_dim"]
        bytes_per_val = 2  # BF16

        total_tokens = hca_compressed + swa_tokens + csa_tokens
        bytes_per_layer = total_tokens * hd * 2 * bytes_per_val  # K+V
        total_gb = bytes_per_layer * PRO["num_layers"] / 1e9

        # Without compression: 1M × 512 × 2 × 2 × 61 = 125 GB
        uncompressed_gb = (context_len * hd * 2 * bytes_per_val * PRO["num_layers"]) / 1e9

        print(f"Compressed KV pool: {total_gb:.2f} GB")
        print(f"Uncompressed KV pool: {uncompressed_gb:.2f} GB")
        print(f"Compression saves: {uncompressed_gb - total_gb:.2f} GB ({(1 - total_gb/uncompressed_gb)*100:.1f}%)")

        # Verify compression achieves the claimed ratio
        assert total_gb < 5.0, f"Compressed KV too large: {total_gb:.2f} GB — compression broken?"
        assert total_gb < uncompressed_gb * 0.02, "Compression ratio worse than expected"

    def test_weight_memory_8gpu(self):
        """Validate weight distribution across 8 GPUs at Pro scale.

        Rough estimate of Pro weight memory in NVFP4:
        - 61 layers × (routed experts + shared expert + attention + mHC)
        - plus embedding and lm_head (hidden × vocab each)
        - NVFP4: 4-bit values + one FP8 scale per 16 values → 0.5625 bytes/param
        - With the PRO values this comes to about 1.56T params → ~880 GB
          → ~110 GB per GPU across 8 GPUs, within a B200's 192 GB HBM.
        """
        H = PRO["hidden_size"]
        I = PRO["moe_intermediate_size"]
        E = PRO["num_routed_experts"]
        n_hc = PRO["n_hc"]

        expert_params = E * H * I * 2  # gate+up
        expert_params += E * I * H  # down
        shared_params = H * I * 3  # gate+up+down
        attn_params = H * (
            PRO["query_compression_dim"]
            + 2 * PRO["head_dim"]
            + PRO["num_output_groups"] * PRO["output_group_dim"]
        )
        mhc_params = n_hc * n_hc * 3 + n_hc * 2  # comb + pre + post

        total_params = (expert_params + shared_params + attn_params + mhc_params) * PRO["num_layers"]
        # .get avoids the KeyError a plain PRO["vocab_size"] lookup raises.
        vocab_size = PRO.get("vocab_size", self.FALLBACK_VOCAB_SIZE)
        total_params += 2 * H * vocab_size  # embedding + lm_head

        nvfp4_bytes = total_params * self.NVFP4_BYTES_PER_PARAM
        per_gpu_bytes = nvfp4_bytes / 8
        per_gpu_gb = per_gpu_bytes / 1e9

        print(f"Total params: {total_params/1e12:.2f}T")
        print(f"NVFP4 weight memory: {nvfp4_bytes/1e9:.2f} GB total, {per_gpu_gb:.2f} GB per GPU")

        budget_gb = self.WEIGHT_HBM_FRACTION * self.B200_HBM_GB
        assert per_gpu_gb < budget_gb, f"Per-GPU weight memory too large: {per_gpu_gb:.2f} GB"
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file as a script reports
    # failures to the shell instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))