#!/usr/bin/env python3
"""
DeepSeek-V4 MoE Runner NaN Test

Tests the CuTeDSLMoERunner (grouped GEMM path) with real weights.
The single-expert tests pass — this test exercises the FULL MoE runner
with routing, padding, grouped GEMM, and combine.

Usage (on B200):
    cd /root/nvfp4-megamoe-kernel
    PYTHONPATH=/root/nvfp4-megamoe-kernel tests/venv/bin/python tests/test_moe_runner_nan_b200.py
"""

import json
import os
import sys

import torch
import torch.nn.functional as F  # noqa: F401  (kept from the original import line)

REPO = "/root/nvfp4-megamoe-kernel"
sys.path.insert(0, REPO)
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
DEV = "cuda:0"

H = 7168
INTERMEDIATE = 3072
NUM_EXPERTS = 384
TOPK = 6
EPS = 1e-6

# Memo of tensors already read from the checkpoint, keyed by tensor name.
_cache = {}


def P(k, wm, md):
    """Return checkpoint tensor ``k``, reading it from disk on first use.

    Args:
        k: Tensor name; must be a key of ``wm``.
        wm: Mapping from tensor name to the safetensors file that stores it.
        md: Directory that contains the safetensors files.

    Returns:
        The tensor produced by ``get_tensor(k)``. The result is memoised in
        the module-level ``_cache`` under ``k`` alone, so ``wm`` and ``md``
        are ignored on a cache hit.

    Raises:
        KeyError: If ``k`` is not in ``wm`` (and not already cached).
    """
    if k in _cache:
        return _cache[k]

    # Imported lazily (like cutedsl in test_moe_runner) so that the helpers in
    # this module can be imported on machines without safetensors installed.
    from safetensors import safe_open

    with safe_open(os.path.join(md, wm[k]), framework="pt") as f:
        t = f.get_tensor(k)
    _cache[k] = t
    return t


def rms(x, w, eps=1e-6):
    """RMS-normalise ``x`` over its last dimension and scale by ``w``.

    The mean of squares is computed in float32; the result is cast back to
    ``x.dtype``.
    """
    v = x.float().pow(2).mean(-1, keepdim=True)
    return (w.float() * (x * torch.rsqrt(v + eps)).float()).to(x.dtype)


def pack_expert_weights(wm, G, layer_id=2, num_local_experts=NUM_EXPERTS):
    """Pack per-expert weights into stacked format for CuTeDSLMoERunner.

    For every expert ``i`` in ``range(num_local_experts)`` the tensors
    ``{gate,up,down}_proj.{weight,weight_scale,weight_scale_2}`` of layer
    ``layer_id`` are fetched through ``G`` and stacked along a new leading
    expert dimension.

    Args:
        wm: Unused here; kept for interface compatibility (``G`` is expected
            to resolve names on its own).
        G: Callable mapping a tensor name to a tensor.
        layer_id: Index of the transformer layer whose experts are packed.
        num_local_experts: Number of experts to load, starting at expert 0.

    Returns:
        ``(w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs)`` where the ``w13_*``
        tensors hold gate_proj followed by up_proj and the ``w2_*`` tensors
        hold down_proj.

    Raises:
        ValueError: If ``num_local_experts`` is smaller than 1.
    """
    if num_local_experts < 1:
        raise ValueError("num_local_experts must be at least 1")

    prefix = f"model.layers.{layer_id}.mlp"
    projections = ("gate_proj", "up_proj", "down_proj")
    suffixes = ("weight", "weight_scale", "weight_scale_2")
    loaded = {(proj, suffix): [] for proj in projections for suffix in suffixes}

    for i in range(num_local_experts):
        e = f"{prefix}.experts.{i}"
        for proj in projections:
            for suffix in suffixes:
                loaded[proj, suffix].append(G(f"{e}.{proj}.{suffix}"))

        if i % 50 == 0:
            print(f" Loaded expert {i}/{num_local_experts}")

    def stacked(proj, suffix):
        # (E, ...) tensor built from the per-expert tensors of one kind.
        return torch.stack(loaded[proj, suffix])

    # L1 ("w13"): gate and up are concatenated, not interleaved - along dim 1
    # the gate block comes first and the up block second.
    w13_w = torch.cat([stacked("gate_proj", "weight"), stacked("up_proj", "weight")], dim=1)
    w13_sf = torch.cat(
        [stacked("gate_proj", "weight_scale"), stacked("up_proj", "weight_scale")], dim=1
    )
    # NOTE(review): the global scales are concatenated along dim 0, so with
    # 0-dim per-expert scales this holds 2*E values (all gate scales, then all
    # up scales). Confirm this is the layout prepare_weights_from_stacked expects.
    w13_gs = torch.cat(
        [stacked("gate_proj", "weight_scale_2"), stacked("up_proj", "weight_scale_2")], dim=0
    )

    # L2 ("w2"): down projection only.
    w2_w = stacked("down_proj", "weight")
    w2_sf = stacked("down_proj", "weight_scale")
    w2_gs = stacked("down_proj", "weight_scale_2")

    return w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs


def test_moe_runner(layer_id=2):
    """Test the CuTeDSLMoERunner with real weights.

    Loads all experts of layer ``layer_id`` from ``MODEL``, feeds the runner
    normalised embeddings of random token ids with random routing, and prints
    whether the output contains NaN for several token counts. Nothing is
    asserted; the function only reports. It returns early with a message when
    CUDA is not available.
    """
    if not torch.cuda.is_available():
        # This file matches pytest's test_* naming, so it can be collected on
        # machines without a GPU; report and bail out instead of crashing.
        print(" SKIPPED: CUDA is not available")
        return

    from cutedsl.runner import CuTeDSLMoERunner

    torch.cuda.set_device(0)
    torch.manual_seed(42)
    torch.cuda.empty_cache()
    _cache.clear()

    with open(os.path.join(MODEL, "model.safetensors.index.json"), encoding="utf-8") as f:
        wm = json.load(f)["weight_map"]

    def G(k):
        # Fetch a checkpoint tensor and move it to the test device.
        return P(k, wm, MODEL).to(DEV)

    p = f"model.layers.{layer_id}"

    emb = G("model.embed_tokens.weight")
    fnorm = G(f"{p}.post_attention_layernorm.weight")

    # All experts are loaded; this takes a while and uses a LOT of memory.
    num_local_experts = NUM_EXPERTS
    print(f" Packing expert weights ({num_local_experts} experts)...")

    # Create the runner first, then prepare weights
    runner = CuTeDSLMoERunner(
        num_experts=num_local_experts,
        hidden_size=H,
        intermediate_size=INTERMEDIATE,
        max_num_tokens=8192,
        top_k=TOPK,
        device=str(DEV),
    )

    # Load and pack weights
    print(" Loading expert weights...")
    w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs = pack_expert_weights(
        wm, G, layer_id, num_local_experts
    )

    print(f" w13_w: {w13_w.shape}, w2_w: {w2_w.shape}")
    print(f" w13_gs: {w13_gs.shape}, w2_gs: {w2_gs.shape}")
    print(f" w13 NaN: {torch.isnan(w13_w.float()).any()}")
    print(f" w2 NaN: {torch.isnan(w2_w.float()).any()}")

    # Prepare weights for the runner
    l1_fp4 = w13_w.view(torch.float4_e2m1fn_x2)
    l2_fp4 = w2_w.view(torch.float4_e2m1fn_x2)
    # Tensor.to returns the tensor itself when the dtype already matches.
    l1_sf = w13_sf.to(torch.float8_e4m3fn)
    l2_sf = w2_sf.to(torch.float8_e4m3fn)

    runner.prepare_weights_from_stacked(
        l1_fp4, l1_sf, w13_gs.flatten().tolist(),
        l2_fp4, l2_sf, w2_gs.flatten().tolist(),
    )

    # Test with various token counts
    for num_tokens in [1, 4, 8, 16]:
        token_ids = torch.randint(1, 1000, (num_tokens,), dtype=torch.long, device=DEV)
        hidden = emb[token_ids]
        normed = rms(hidden, fnorm, EPS)

        # Routing is random: the test looks for NaN, not for correct outputs.
        topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
        topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)

        print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")

        with torch.no_grad():
            result = runner.run(normed, topk_weights, topk_ids)

        result_nan = torch.isnan(result).any().item()
        # amax is reported as -1 when the output contains NaN.
        result_amax = result.amax().item() if not result_nan else -1
        print(f" {num_tokens} tokens: output amax={result_amax:.4f} NaN={result_nan}")

        if result_nan:
            nan_rows = torch.isnan(result).any(dim=1).sum().item()
            print(f" {num_tokens} tokens: {nan_rows}/{num_tokens} rows have NaN")

    # Drop every reference to the weight storage (including the views and the
    # converted scale tensors) so that empty_cache can actually release it.
    del runner, w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
    del l1_fp4, l2_fp4, l1_sf, l2_sf, emb, fnorm
    torch.cuda.empty_cache()
    _cache.clear()


def main():
    """Print the banner and run the MoE runner NaN test on layer 2."""
    banner = "=" * 70
    print(banner)
    print(" DeepSeek-V4 MoE Runner NaN Test")
    print(" Tests CuTeDSLMoERunner (grouped GEMM) with real weights")
    print(banner)

    test_moe_runner(layer_id=2)

    print(f"\n{banner}")


if __name__ == "__main__":
    main()