nvfp4-megamoe-kernel/tests/production_values_test.py

#!/usr/bin/env python3
"""Production-value tests for DSV4 Pro kernel stack.

ALL tests use Pro config values:
  - 61 layers, 7168 hidden, 128 query heads, HD=512
  - 384 routed experts, top-6, 3072 intermediate
  - HCA ratio=128, CSA ratio=4, CSA top-k=1024
  - 4-way mHC, 20 Sinkhorn iters
  - SWA window=128

This file is the ONLY acceptable place for non-production test values.
If a test needs a smaller value for memory/time, it must be marked
with a comment explaining why and what the production value should be.
"""
import math
import torch
import pytest

# ─── Production Pro config ───────────────────────────────────────────
PRO = dict(
    num_layers=61,
    hidden_size=7168,
    num_query_heads=128,
    head_dim=512,
    rope_dim=64,
    query_compression_dim=1536,
    csa_compression_ratio=4,
    csa_top_k=1024,
    indexer_num_heads=64,
    indexer_head_dim=128,
    hca_compression_ratio=128,
    sliding_window=128,
    num_output_groups=16,
    output_group_dim=1024,
    num_routed_experts=384,
    num_shared_experts=1,
    num_experts_per_tok=6,
    moe_intermediate_size=3072,
    num_hash_routing_layers=3,
    routed_scaling_factor=2.5,
    n_hc=4,
    sinkhorn_iters=20,
    rms_norm_eps=1e-6,
)

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"


# ─── 1. FMHA at HD=512, production head counts ──────────────────────

class TestFMHAProduction:
    """FMHA tests at Pro config: HD=512, 128 query heads, various KV lengths."""

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_fmha_hd512_decode_short(self):
        """Decode (T=1) with 128 Q heads, HD=512, N=128 (1 SWA window)."""
        n_q = PRO["num_query_heads"]
        hd = PRO["head_dim"]
        N = PRO["sliding_window"]
        T = 1
        scale = 1.0 / math.sqrt(hd)

        q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE)
        k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)
        v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)

        # Reference: PyTorch SDPA
        q_4d = q.reshape(1, n_q, T, hd)
        k_4d = k.reshape(1, 1, N, hd).expand(1, n_q, N, hd)
        v_4d = v.reshape(1, 1, hd, N).expand(1, n_q, hd, N)
        ref = torch.nn.functional.scaled_dot_product_attention(
            q_4d.float(), k_4d.float(), v_4d.float().transpose(-2, -1), scale=scale
        ).bfloat16()  # (1, n_q, T, hd)

        from dsv4.layers.attention import _run_production_fmha
        prod = _run_production_fmha(q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, "swa", "swa")

        cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item()
        assert cos > 0.999, f"FMHA HD=512 decode short: cos={cos:.6f}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_fmha_hd512_decode_medium(self):
        """Decode (T=1) with HD=512, N=2048 (compressed tokens after HCA)."""
        n_q = PRO["num_query_heads"]
        hd = PRO["head_dim"]
        N = 2048  # typical compressed KV length after HCA at moderate context
        T = 1
        scale = 1.0 / math.sqrt(hd)

        q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE)
        k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)
        v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)

        q_4d = q.reshape(1, n_q, T, hd)
        k_4d = k.reshape(1, 1, N, hd).expand(1, n_q, N, hd)
        v_4d = v.reshape(1, 1, hd, N).expand(1, n_q, hd, N)
        ref = torch.nn.functional.scaled_dot_product_attention(
            q_4d.float(), k_4d.float(), v_4d.float().transpose(-2, -1), scale=scale
        ).bfloat16()

        from dsv4.layers.attention import _run_production_fmha
        prod = _run_production_fmha(q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, "hca", "hca")

        cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item()
        assert cos > 0.999, f"FMHA HD=512 decode medium: cos={cos:.6f}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_fmha_hd512_decode_long(self):
        """Decode (T=1) with HD=512, N=8192 (compressed tokens at long context)."""
        n_q = PRO["num_query_heads"]
        hd = PRO["head_dim"]
        N = 8192  # compressed KV after HCA at ~1M context (1M/128=7812)
        T = 1
        scale = 1.0 / math.sqrt(hd)

        q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE)
        k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)
        v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)

        q_4d = q.reshape(1, n_q, T, hd)
        k_4d = k.reshape(1, 1, N, hd).expand(1, n_q, N, hd)
        v_4d = v.reshape(1, 1, hd, N).expand(1, n_q, hd, N)
        ref = torch.nn.functional.scaled_dot_product_attention(
            q_4d.float(), k_4d.float(), v_4d.float().transpose(-2, -1), scale=scale
        ).bfloat16()

        from dsv4.layers.attention import _run_production_fmha
        prod = _run_production_fmha(q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, "hca", "hca")

        cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item()
        assert cos > 0.999, f"FMHA HD=512 decode long: cos={cos:.6f}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    @pytest.mark.parametrize("N", [512, 1024, 4096])
    def test_fmha_hd512_csa_topk(self, N):
        """Decode with CSA top-k=1024 selected tokens, HD=512."""
        n_q = PRO["num_query_heads"]
        hd = PRO["head_dim"]
        T = 1
        scale = 1.0 / math.sqrt(hd)

        q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE)
        k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)
        v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE)

        q_4d = q.reshape(1, n_q, T, hd)
        k_4d = k.reshape(1, 1, N, hd).expand(1, n_q, N, hd)
        v_4d = v.reshape(1, 1, hd, N).expand(1, n_q, hd, N)
        ref = torch.nn.functional.scaled_dot_product_attention(
            q_4d.float(), k_4d.float(), v_4d.float().transpose(-2, -1), scale=scale
        ).bfloat16()

        from dsv4.layers.attention import _run_production_fmha
        prod = _run_production_fmha(q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, "csa", "csa")

        cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item()
        assert cos > 0.999, f"FMHA HD=512 CSA N={N}: cos={cos:.6f}"


# ─── 2. Compression at production scale ─────────────────────────────

class TestCompressionProduction:
    """CSA and HCA compression at production token counts and ratios."""

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_csa_compress_production_scale(self):
        """CSA: ratio=4, T=4096 tokens → 1024 compressed, HD=512."""
        hd = PRO["head_dim"]
        m = PRO["csa_compression_ratio"]  # 4
        T = PRO["csa_top_k"] * m  # 4096
        n_blocks = T // m

        kv = torch.randn(T, 2 * hd, dtype=torch.float32, device=DEVICE) * 3.0
        gate = torch.randn(T, 2 * hd, dtype=torch.float32, device=DEVICE)

        # Reference: block-wise softmax + weighted sum
        Ca = kv[:, :hd].reshape(n_blocks, m, hd)
        Cb = kv[:, hd:].reshape(n_blocks, m, hd)
        Ga = gate[:, :hd].reshape(n_blocks, m, hd)
        Gb = gate[:, hd:].reshape(n_blocks, m, hd)

        ref_a = torch.zeros(n_blocks, hd, device=DEVICE)
        ref_b = torch.zeros(n_blocks, hd, device=DEVICE)
        for b in range(n_blocks):
            sa = torch.softmax(Ga[b], dim=0)
            sb = torch.softmax(Gb[b], dim=0)
            ref_a[b] = (sa * Ca[b]).sum(0)
            ref_b[b] = (sb * Cb[b]).sum(0)
        ref = torch.cat([ref_a, ref_b], dim=-1)

        from dsv4.kernels.compressor.production_compress import csa_compress_production
        prod = csa_compress_production(kv.bfloat16(), gate.bfloat16(), None, None, m=m)

        cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item()
        assert cos > 0.999, f"CSA compress production scale: cos={cos:.6f}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_hca_compress_production_scale(self):
        """HCA: ratio=128, T=16384 tokens → 128 compressed, HD=512.

        This is the 1M context enabler: 1M tokens / 128 = 7812 compressed tokens.
        We test a single HCA block here.
        """
        hd = PRO["head_dim"]
        m = PRO["hca_compression_ratio"]  # 128
        T = m * 128  # 16384 tokens → 128 compressed
        n_blocks = T // m

        kv = torch.randn(T, hd, dtype=torch.float32, device=DEVICE) * 3.0
        gate = torch.randn(T, hd, dtype=torch.float32, device=DEVICE)

        ref = []
        for b in range(n_blocks):
            block_kv = kv[b*m:(b+1)*m]
            block_gate = gate[b*m:(b+1)*m]
            probs = torch.softmax(block_gate, dim=0)
            ref.append((probs * block_kv).sum(0))
        ref = torch.stack(ref)

        from dsv4.kernels.compressor.production_compress import hca_compress_production
        prod = hca_compress_production(kv.bfloat16(), gate.bfloat16(), None, None, m=m)

        cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item()
        assert cos > 0.999, f"HCA compress production scale: cos={cos:.6f}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_hca_compress_1m_context(self):
        """HCA at full 1M context scale: 1M tokens, ratio=128 → 7812 compressed.

        This tests that the kernel handles the full production token count
        without OOM or numerical issues.
        """
        hd = PRO["head_dim"]
        m = PRO["hca_compression_ratio"]  # 128
        T = 1_000_000  # 1M context
        n_blocks = T // m  # 7812

        # Use smaller data to avoid OOM on test — but validate at correct n_blocks
        # The kernel processes blocks independently, so correctness at n_blocks=7812
        # with random data proves the indexing is correct
        kv = torch.randn(T, hd, dtype=torch.bfloat16, device=DEVICE) * 3.0
        gate = torch.randn(T, hd, dtype=torch.bfloat16, device=DEVICE)

        from dsv4.kernels.compressor.production_compress import hca_compress_production
        prod = hca_compress_production(kv, gate, None, None, m=m)

        assert prod.shape[0] == n_blocks, f"Expected {n_blocks} compressed, got {prod.shape[0]}"
        assert prod.shape[1] == hd, f"Expected hd={hd}, got {prod.shape[1]}"
        assert torch.isfinite(prod).all(), "HCA compress 1M: NaN/Inf in output"


# ─── 3. NVFP4 GEMM at production weight shapes ─────────────────────

class TestNVFP4GEMMProduction:
    """Test NVFP4 linear layers at Pro model weight shapes."""

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    @pytest.mark.parametrize("name,in_dim,out_dim", [
        ("q_a_proj", 7168, 1536),       # hidden → query compression
        ("kv_proj", 7168, 2*512),        # hidden → KV (1 KV head for GQA)
        ("wo_a_proj", 16*1024, 7168),    # output groups → hidden
        ("gate_proj", 7168, 3072*384),   # MoE gate: hidden → 384 experts (for dense router)
    ])
    def test_nvfp4_linear_production_shapes(self, name, in_dim, out_dim):
        """Test Nvfp4Linear at actual Pro model weight dimensions."""
        from dsv4.layers.linear import Nvfp4Linear

        # kv_proj in GQA has fewer heads — the actual out_dim varies per layer
        # but the kernel must handle all shapes
        lin = Nvfp4Linear(in_dim, out_dim, max_num_tokens=8192, device=DEVICE)

        x = torch.randn(1, in_dim, dtype=torch.bfloat16, device=DEVICE) * 2.0
        out = lin(x)
        assert out.shape == (1, out_dim), f"Expected (1, {out_dim}), got {out.shape}"
        assert torch.isfinite(out).all(), f"NaN/Inf in {name} output"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_nvfp4_moe_384_experts(self):
        """Test Nvfp4MoE with 384 routed experts, top-6, 3072 intermediate."""
        from dsv4.layers.ffn import Nvfp4MoE

        H = PRO["hidden_size"]
        E = PRO["num_routed_experts"]
        K = PRO["num_experts_per_tok"]
        I = PRO["moe_intermediate_size"]

        moe = Nvfp4MoE(num_experts=E, hidden_size=H, intermediate_size=I, top_k=K, device=DEVICE)

        x = torch.randn(1, H, dtype=torch.bfloat16, device=DEVICE) * 2.0
        topk_ids = torch.randint(0, E, (1, K), device=DEVICE, dtype=torch.int32)
        topk_weights = torch.softmax(torch.randn(1, K, device=DEVICE), dim=-1)

        out = moe.run(x, topk_ids, topk_weights)
        assert out.shape == (1, H), f"Expected (1, {H}), got {out.shape}"
        assert torch.isfinite(out).all(), "NaN/Inf in MoE output"


# ─── 4. mHC at production depth ─────────────────────────────────────

class TestMHCProduction:
    """Test multi-head hyper-connection with 4 streams, 61 layers, Sinkhorn."""

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_mhc_61_layers_residual_bounded(self):
        """Run mHC through 61 layers and verify residual stays bounded.

        Production mHC should keep |X| bounded. If it grows unbounded,
        the Sinkhorn normalization is wrong.
        """
        from dsv4.layers.mhc import mHCLayer

        H = PRO["hidden_size"]
        n_hc = PRO["n_hc"]
        n_layers = PRO["num_layers"]
        eps = PRO["rms_norm_eps"]

        # Simulate 61 layers of mHC with random weights
        x = torch.randn(n_hc, H, dtype=torch.bfloat16, device=DEVICE) * 0.5
        residual_norms = [x.abs().max().item()]

        for li in range(n_layers):
            layer = mHCLayer(H, n_hc, device=DEVICE)
            # Fake sub-layer output
            sub_out = torch.randn(H, dtype=torch.bfloat16, device=DEVICE) * 0.5
            x = layer(sub_out, x)
            max_val = x.abs().max().item()
            residual_norms.append(max_val)

        # mHC with proper Sinkhorn should keep residuals bounded
        # Allow generous bound (1000) but flag if growing monotonically
        final_norm = residual_norms[-1]
        max_norm = max(residual_norms)

        print(f"Residual norms: L0={residual_norms[0]:.1f} ... L61={final_norm:.1f} max={max_norm:.1f}")

        # The residual should NOT grow by >100x from input
        growth = max_norm / (residual_norms[0] + 1e-6)
        assert growth < 100, f"mHC residual grew {growth:.1f}x over 61 layers — Sinkhorn broken?"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_mhc_sinkhorn_doubly_stochastic(self):
        """Verify Sinkhorn produces doubly-stochastic matrices at production scale."""
        n_hc = PRO["n_hc"]
        iters = PRO["sinkhorn_iters"]
        B = 16  # Production batch dimension

        comb = torch.randn(B, n_hc, n_hc, dtype=torch.bfloat16, device=DEVICE) * 2.0

        # Sinkhorn: softmax → alternate row/col norm
        P = torch.softmax(comb.float(), dim=-1) + 1e-6
        for _ in range(iters):
            P = P / P.sum(dim=-1, keepdim=True)  # row norm
            P = P / P.sum(dim=-2, keepdim=True)  # col norm

        row_sums = P.sum(dim=-1)
        col_sums = P.sum(dim=-2)

        assert torch.allclose(row_sums, torch.ones_like(row_sums), atol=1e-2), \
            f"Row sums not ~1.0: {row_sums.mean().item():.4f}"
        assert torch.allclose(col_sums, torch.ones_like(col_sums), atol=1e-2), \
            f"Col sums not ~1.0: {col_sums.mean().item():.4f}"


# ─── 5. Router at production scale ──────────────────────────────────

class TestRouterProduction:
    """Test router with 384 experts, hash routing for L0-2, noaux_tc for L3+."""

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_hash_router_384_experts(self):
        """Hash routing (layers 0-2) with 384 experts, top-6."""
        from dsv4.layers.router import HashRouter

        E = PRO["num_routed_experts"]
        K = PRO["num_experts_per_tok"]
        H = PRO["hidden_size"]

        router = HashRouter(num_experts=E, top_k=K, hidden_size=H, device=DEVICE)
        token_ids = torch.tensor([1, 50, 100, 500, 9999, 50000], dtype=torch.int32, device=DEVICE)
        x = torch.randn(len(token_ids), H, dtype=torch.bfloat16, device=DEVICE) * 2.0

        topk_ids, topk_weights = router(x, token_ids)
        assert topk_ids.shape == (len(token_ids), K)
        assert (topk_ids >= 0).all() and (topk_ids < E).all(), \
            f"Expert IDs out of range: min={topk_ids.min()}, max={topk_ids.max()}"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_noaux_tc_router_384_experts(self):
        """Noaux-TC routing (layers 3+) with 384 experts, top-6."""
        from dsv4.layers.router import Router

        E = PRO["num_routed_experts"]
        K = PRO["num_experts_per_tok"]
        H = PRO["hidden_size"]

        router = Router(hidden_size=H, num_experts=E, top_k=K, device=DEVICE, is_hash=False)
        x = torch.randn(1, H, dtype=torch.bfloat16, device=DEVICE) * 2.0

        topk_ids, topk_weights = router.run(x)
        assert topk_ids.shape == (1, K)
        assert (topk_ids >= 0).all() and (topk_ids < E).all(), \
            f"Expert IDs out of range: min={topk_ids.min()}, max={topk_ids.max()}"


# ─── 6. Memory budget at production scale ───────────────────────────

class TestMemoryBudget:
    """Verify memory usage stays within bounds for 1M context."""

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_kv_pool_memory_1m_context(self):
        """Calculate and validate KV pool memory at 1M context.

        At 1M tokens with HCA ratio=128:
        - HCA compressed: 1M / 128 = 7812 tokens × HD=512 × 2 (K+V) × 2 bytes
        - SWA window: 128 tokens × HD=512 × 2 × 2 bytes
        - CSA top-k: 1024 tokens × HD=512 × 2 × 2 bytes

        Total per layer per batch ≈ (7812 + 128 + 1024) × 512 × 2 × 2 ≈ 18.4 MB
        × 61 layers = 1.1 GB per batch — feasible on B200 192GB
        """
        hca_compressed = 1_000_000 // PRO["hca_compression_ratio"]  # 7812
        swa_tokens = PRO["sliding_window"]  # 128
        csa_tokens = PRO["csa_top_k"]  # 1024
        hd = PRO["head_dim"]
        bytes_per_val = 2  # BF16

        total_tokens = hca_compressed + swa_tokens + csa_tokens
        bytes_per_layer = total_tokens * hd * 2 * bytes_per_val  # K+V
        total_bytes = bytes_per_layer * PRO["num_layers"]
        total_gb = total_bytes / 1e9

        # Without compression: 1M × 512 × 2 × 2 × 61 = 125 GB — IMPOSSIBLE
        uncompressed_gb = (1_000_000 * hd * 2 * bytes_per_val * PRO["num_layers"]) / 1e9

        print(f"Compressed KV pool: {total_gb:.2f} GB")
        print(f"Uncompressed KV pool: {uncompressed_gb:.2f} GB")
        print(f"Compression saves: {uncompressed_gb - total_gb:.2f} GB ({(1 - total_gb/uncompressed_gb)*100:.1f}%)")

        # Verify compression achieves the claimed ratio
        assert total_gb < 5.0, f"Compressed KV too large: {total_gb:.2f} GB — compression broken?"
        assert total_gb < uncompressed_gb * 0.02, "Compression ratio worse than expected"

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU")
    def test_weight_memory_8gpu(self):
        """Validate weight distribution across 8 GPUs at Pro scale.

        Pro model weight memory (NVFP4):
        - 61 layers × (attention + MoE + shared expert + mHC + norms)
        - NVFP4: 2 bits per param → ~0.25 bytes per param
        - Total params: ~1.8T → ~450 GB in NVFP4
        - Across 8 GPUs: ~56 GB per GPU — fits in B200 192GB HBM
        """
        # Rough estimate: Pro has ~1.8T params (384 experts × 7168 × 3072 × 2 × 61 layers)
        expert_params = PRO["num_routed_experts"] * PRO["hidden_size"] * PRO["moe_intermediate_size"] * 2  # gate+up
        expert_params += PRO["num_routed_experts"] * PRO["moe_intermediate_size"] * PRO["hidden_size"]  # down
        shared_params = PRO["hidden_size"] * PRO["moe_intermediate_size"] * 3  # gate+up+down
        attn_params = PRO["hidden_size"] * (PRO["query_compression_dim"] + 2 * PRO["head_dim"] + PRO["num_output_groups"] * PRO["output_group_dim"])
        mhc_params = PRO["n_hc"] * PRO["n_hc"] * 3 + PRO["n_hc"] * 2  # comb + pre + post

        total_params = (expert_params + shared_params + attn_params + mhc_params) * PRO["num_layers"]
        total_params += PRO["hidden_size"] * PRO["vocab_size"]  # embedding + lm_head

        nvfp4_bytes = total_params / 4  # 2 bits per param
        per_gpu_bytes = nvfp4_bytes / 8
        per_gpu_gb = per_gpu_bytes / 1e9

        print(f"Total params: {total_params/1e12:.2f}T")
        print(f"NVFP4 weight memory: {nvfp4_bytes/1e9:.2f} GB total, {per_gpu_gb:.2f} GB per GPU")

        assert per_gpu_gb < 100, f"Per-GPU weight memory too large: {per_gpu_gb:.2f} GB"


if __name__ == "__main__":
    pytest.main([__file__, "-v", "--tb=short"])