# nvfp4-megamoe-kernel/tests/archive/test_moe_runner_nan_b200.py

#!/usr/bin/env python3
"""
DeepSeek-V4 MoE Runner NaN Test

Tests the Nvfp4MoE (grouped GEMM path) with real weights.
The single-expert tests pass — this test exercises the FULL MoE runner
with routing, padding, grouped GEMM, and combine.

Usage (on B200):
  cd /root/nvfp4-megamoe-kernel
  PYTHONPATH=/root/nvfp4-megamoe-kernel tests/venv/bin/python tests/archive/test_moe_runner_nan_b200.py
"""

import sys, os, json, torch, torch.nn.functional as F
from safetensors import safe_open

# Root of the kernel checkout. It is put at the front of sys.path, presumably
# so the `dsv4` package imported inside test_moe_runner resolves from here.
REPO = "/root/nvfp4-megamoe-kernel"
sys.path.insert(0, REPO)
# Checkpoint directory; test_moe_runner reads model.safetensors.index.json
# from it and P() opens the shard files that index points to.
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
# Device that loaded tensors are moved to and that the runner is created on.
DEV = "cuda:0"

# Passed to Nvfp4MoE as hidden_size.
H = 7168
# Passed to Nvfp4MoE as intermediate_size.
INTERMEDIATE = 3072
# Not referenced anywhere in the visible code; presumably the number of routed
# experts in the full checkpoint — TODO confirm.
NUM_EXPERTS = 384
# Passed to Nvfp4MoE as top_k; also the number of expert ids drawn per token.
TOPK = 6
# Epsilon handed to rms() when normalising the embedded tokens.
EPS = 1e-6

# Tensors already read from disk, keyed by (model directory, tensor name).
# Callers reset it with _cache.clear().
_cache = {}


def P(k, wm, md):
    """Return checkpoint tensor ``k``, reading it from disk at most once.

    Args:
        k: Tensor name; used both as the key into ``wm`` and as the name
            requested from the safetensors file.
        wm: Mapping from tensor name to the shard file (relative to ``md``)
            that stores it.
        md: Directory containing the shard files.

    Returns:
        The tensor as returned by ``safe_open(..., framework="pt")``.

    Raises:
        KeyError: If ``k`` is not cached and is missing from ``wm``.
    """
    # Include the directory in the key: the same tensor name loaded from two
    # different checkpoints must not alias to whichever was read first.
    cache_key = (md, k)
    if cache_key not in _cache:
        shard_path = os.path.join(md, wm[k])
        with safe_open(shard_path, framework="pt") as f:
            _cache[cache_key] = f.get_tensor(k)
    return _cache[cache_key]

def rms(x, w, eps=1e-6):
    """Apply RMS normalisation over the last dimension of ``x`` and scale by ``w``.

    The mean of squares is computed in float32, and the result is cast back
    to the dtype of ``x``.

    Args:
        x: Input tensor; normalised along its last dimension.
        w: Scale tensor multiplied into the normalised values (as float32).
        eps: Value added to the mean of squares before the reciprocal sqrt.

    Returns:
        Tensor with the dtype of ``x``.
    """
    mean_sq = torch.mean(x.float().pow(2), dim=-1, keepdim=True)
    inv_rms = torch.rsqrt(mean_sq + eps)
    normalised = (x * inv_rms).float()
    scaled = w.float() * normalised
    return scaled.to(x.dtype)


def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
    """Load the first ``num_local_experts`` experts of a layer and stack them.

    For every expert ``i`` in ``range(num_local_experts)`` the ``weight``,
    ``weight_scale`` and ``weight_scale_2`` tensors of ``gate_proj``,
    ``up_proj`` and ``down_proj`` are fetched through ``G`` (in that order)
    under ``model.layers.{layer_id}.mlp.experts.{i}``.

    Args:
        wm: Weight map from the checkpoint index. Not used by this function;
            kept in the signature so existing callers keep working.
        G: Callable taking a checkpoint tensor name and returning a tensor.
        layer_id: Index of the layer whose experts are loaded.
        num_local_experts: Number of experts to load, starting at expert 0.

    Returns:
        Tuple ``(w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs)`` where

        * ``w13_w`` / ``w13_sf`` are the per-expert stacks of the gate_proj
          ``weight`` / ``weight_scale`` tensors followed by the up_proj ones,
          concatenated along dim 1 (gate rows first, then up rows);
        * ``w13_gs`` is the stack of gate_proj ``weight_scale_2`` tensors
          followed by the stack of up_proj ones, concatenated along dim 0;
        * ``w2_w`` / ``w2_sf`` / ``w2_gs`` are the per-expert stacks of the
          down_proj ``weight`` / ``weight_scale`` / ``weight_scale_2``.

    Raises:
        RuntimeError: From ``torch.stack`` if ``num_local_experts`` is 0.
    """
    prefix = f"model.layers.{layer_id}.mlp"
    projections = ("gate_proj", "up_proj", "down_proj")
    suffixes = ("weight", "weight_scale", "weight_scale_2")
    loaded = {(proj, suffix): [] for proj in projections for suffix in suffixes}

    for i in range(num_local_experts):
        expert = f"{prefix}.experts.{i}"
        for proj in projections:
            for suffix in suffixes:
                loaded[proj, suffix].append(G(f"{expert}.{proj}.{suffix}"))

        if i % 50 == 0:
            print(f"    Loaded expert {i}/{num_local_experts}")

    def stack(proj, suffix):
        # One (E, ...) tensor per projection/suffix pair.
        return torch.stack(loaded[proj, suffix])

    # Gate and up are placed side by side along dim 1. The original author's
    # note gives the packed weight as (E, 2*intermediate, hidden//2).
    w13_w = torch.cat([stack("gate_proj", "weight"), stack("up_proj", "weight")], dim=1)
    w13_sf = torch.cat(
        [stack("gate_proj", "weight_scale"), stack("up_proj", "weight_scale")], dim=1
    )
    # NOTE(review): this concatenates along dim 0, so the result holds 2*E
    # entries (all gate scales, then all up scales) rather than one entry per
    # expert — confirm this is the layout the consumer expects.
    w13_gs = torch.cat(
        [stack("gate_proj", "weight_scale_2"), stack("up_proj", "weight_scale_2")], dim=0
    )

    w2_w = stack("down_proj", "weight")
    w2_sf = stack("down_proj", "weight_scale")
    w2_gs = stack("down_proj", "weight_scale_2")

    return w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs


def test_moe_runner(layer_id=2):
    """Run Nvfp4MoE on real checkpoint weights and report NaNs in its output.

    Loads the embedding table, the post-attention norm weight and the first
    16 experts of layer ``layer_id`` from ``MODEL``, hands the packed expert
    weights to an ``Nvfp4MoE`` runner, and then runs batches of 1, 4, 8 and
    16 random tokens with random routing through it. For each batch the
    input and output maxima and NaN status are printed. Nothing is asserted,
    so this is a diagnostic rather than a pass/fail test.

    Requires CUDA device 0, the checkpoint under ``MODEL`` and the ``dsv4``
    package to be importable.

    Args:
        layer_id: Index of the layer whose experts and norm weight are used.
    """
    from dsv4.layers.moe import Nvfp4MoE

    torch.cuda.set_device(0)
    torch.manual_seed(42)
    torch.cuda.empty_cache()
    _cache.clear()

    with open(os.path.join(MODEL, "model.safetensors.index.json")) as f:
        wm = json.load(f)["weight_map"]
    G = lambda k: P(k, wm, MODEL).to(DEV)

    p = f"model.layers.{layer_id}"

    emb = G("model.embed_tokens.weight")
    fnorm = G(f"{p}.post_attention_layernorm.weight")

    # Only a subset of the experts is loaded so the test fits in memory.
    num_local_experts = 16
    print(f"  Packing expert weights ({num_local_experts} experts)...")

    # Create the runner first, then prepare weights.
    runner = Nvfp4MoE(
        num_experts=num_local_experts,
        hidden_size=H,
        intermediate_size=INTERMEDIATE,
        max_num_tokens=8192,
        top_k=TOPK,
        device=str(DEV),
    )

    print(f"  Loading expert weights...")
    w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs = pack_expert_weights(
        wm, G, layer_id, num_local_experts
    )

    print(f"  w13_w: {w13_w.shape}, w2_w: {w2_w.shape}")
    print(f"  w13_gs: {w13_gs.shape}, w2_gs: {w2_gs.shape}")
    print(f"  w13 NaN: {torch.isnan(w13_w.float()).any()}")
    print(f"  w2 NaN: {torch.isnan(w2_w.float()).any()}")

    # Reinterpret the packed weights as FP4 pairs. Tensor.to() returns the
    # tensor itself when the dtype already matches, so no explicit dtype
    # check is needed for the scale factors.
    l1_fp4 = w13_w.view(torch.float4_e2m1fn_x2)
    l2_fp4 = w2_w.view(torch.float4_e2m1fn_x2)
    l1_sf = w13_sf.to(torch.float8_e4m3fn)
    l2_sf = w2_sf.to(torch.float8_e4m3fn)

    # flatten() leaves a 1-D tensor unchanged, so it covers both the 1-D and
    # the multi-dimensional global-scale layouts.
    runner.prepare_weights_from_stacked(
        l1_fp4, l1_sf, w13_gs.flatten().tolist(),
        l2_fp4, l2_sf, w2_gs.flatten().tolist(),
    )

    # Test with various token counts.
    for num_tokens in [1, 4, 8, 16]:
        token_ids = torch.randint(1, 1000, (num_tokens,), dtype=torch.long, device=DEV)
        hidden = emb[token_ids]
        normed = rms(hidden, fnorm, EPS)

        # NOTE(review): randint can give one token the same expert id more
        # than once — confirm Nvfp4MoE.run tolerates duplicate ids per token.
        topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
        topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)

        # Tensor.amax() is the signed maximum, not the maximum absolute value.
        print(f"  {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")

        with torch.no_grad():
            result = runner.run(normed, topk_weights, topk_ids)

        result_nan = torch.isnan(result).any().item()
        # -1 is a placeholder: the maximum is not meaningful when NaNs are present.
        result_amax = result.amax().item() if not result_nan else -1
        print(f"  {num_tokens} tokens: output amax={result_amax:.4f} NaN={result_nan}")

        if result_nan:
            nan_rows = torch.isnan(result).any(dim=1).sum().item()
            print(f"  {num_tokens} tokens: {nan_rows}/{num_tokens} rows have NaN")

    # Drop every reference to the large tensors, including the dtype views
    # that share storage with the packed weights, so empty_cache() can
    # actually release them.
    del runner, w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
    del l1_fp4, l2_fp4, l1_sf, l2_sf, emb, fnorm
    torch.cuda.empty_cache()
    _cache.clear()


def main():
    """Print the banner, run the MoE runner check on layer 2, then print a closing rule."""
    rule = "=" * 70
    banner = (
        rule,
        "  DeepSeek-V4 MoE Runner NaN Test",
        "  Tests Nvfp4MoE (grouped GEMM) with real weights",
        rule,
    )
    for line in banner:
        print(line)

    test_moe_runner(layer_id=2)

    # Blank line followed by the closing rule.
    print("\n" + rule)

# Run the diagnostic only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()