diff --git a/tests/test_moe_runner_nan_b200.py b/tests/test_moe_runner_nan_b200.py
new file mode 100644
index 00000000..97e7ebe4
--- /dev/null
+++ b/tests/test_moe_runner_nan_b200.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""
+DeepSeek-V4 MoE Runner NaN Test
+
+Tests the CuTeDSLMoERunner (grouped GEMM path) with real weights.
+The single-expert tests pass — this test exercises the FULL MoE runner
+with routing, padding, grouped GEMM, and combine.
+
+Usage (on B200):
+  cd /root/nvfp4-megamoe-kernel
+  PYTHONPATH=/root/nvfp4-megamoe-kernel tests/venv/bin/python tests/test_moe_runner_nan_b200.py
+"""
+
+import sys, os, json, torch, torch.nn.functional as F
+from safetensors import safe_open
+
+REPO = "/root/nvfp4-megamoe-kernel"  # repo checkout; same path as in the Usage note above
+sys.path.insert(0, REPO)  # presumably makes the `cutedsl` package importable — verify it lives under REPO
+MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"  # dir with model.safetensors.index.json and its shards
+DEV = "cuda:0"  # device every loaded tensor is moved to
+
+H = 7168  # hidden_size handed to CuTeDSLMoERunner
+INTERMEDIATE = 3072  # intermediate_size handed to CuTeDSLMoERunner
+NUM_EXPERTS = 384  # expert count (same value test_moe_runner uses)
+TOPK = 6  # runner top_k and width of the random routing tensors
+EPS = 1e-6  # epsilon passed to rms()
+
+_cache = {}  # tensor name -> tensor already read from disk
+def P(k, wm, md):
+    """Return tensor `k` read from shard file `wm[k]` inside directory `md`, memoized in `_cache`."""
+    if k not in _cache:
+        with safe_open(os.path.join(md, wm[k]), framework="pt") as shard:
+            _cache[k] = shard.get_tensor(k)
+    return _cache[k]
+
+def rms(x, w, eps=1e-6):  # RMS-normalize x over its last dim and scale by w; result cast back to x.dtype
+    inv_rms = torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + eps)
+    return (w.float() * (x * inv_rms).float()).to(x.dtype)
+
+
+def pack_expert_weights(wm, G, layer_id=2, num_local_experts=384):
+    """Pack per-expert weights into stacked format for CuTeDSLMoERunner.
+
+    Loads weight, weight_scale and weight_scale_2 of gate_proj, up_proj and
+    down_proj for every expert through G, then stacks them over experts.
+
+    Args:
+        wm: Weight map from the safetensors index. Not used here (G is
+            expected to resolve names itself); kept for existing callers.
+        G: Callable that maps a checkpoint tensor name to a loaded tensor.
+        layer_id: Index of the decoder layer whose experts are packed.
+        num_local_experts: Experts 0..num_local_experts-1 are loaded.
+
+    Returns:
+        Tuple (w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs):
+        - w13_w, w13_sf: per expert, the gate_proj rows followed by the
+          up_proj rows (stack over experts, then cat on dim 1).
+        - w13_gs: all gate_proj weight_scale_2 values followed by all
+          up_proj ones (cat on dim 0).
+        - w2_w, w2_sf, w2_gs: down_proj tensors stacked over experts.
+    """
+    m = f"model.layers.{layer_id}.mlp"
+    suffixes = ("weight", "weight_scale", "weight_scale_2")
+    # per_proj[name] holds one list per suffix, each with one tensor per expert.
+    per_proj = {name: ([], [], []) for name in ("gate_proj", "up_proj", "down_proj")}
+
+    for i in range(num_local_experts):
+        for name, buckets in per_proj.items():
+            for bucket, suffix in zip(buckets, suffixes):
+                bucket.append(G(f"{m}.experts.{i}.{name}.{suffix}"))
+        if i % 50 == 0:
+            print(f"    Loaded expert {i}/{num_local_experts}")
+
+    # Stack each list into an (E, ...) tensor exactly once: every stack is a
+    # full copy of the expert tensors, so avoid throwaway intermediates.
+    gate_w, gate_sf, gate_gs = (torch.stack(ts) for ts in per_proj["gate_proj"])
+    up_w, up_sf, up_gs = (torch.stack(ts) for ts in per_proj["up_proj"])
+    w2_w, w2_sf, w2_gs = (torch.stack(ts) for ts in per_proj["down_proj"])
+
+    # w13 = gate rows followed by up rows for each expert; with this test's
+    # sizes that is presumably (E, 6144, 3584) — TODO confirm.
+    w13_w = torch.cat([gate_w, up_w], dim=1)
+    w13_sf = torch.cat([gate_sf, up_sf], dim=1)
+    # NOTE(review): when weight_scale_2 is a 0-dim scalar this has 2*E entries
+    # (E gate values, then E up values), not one per expert — confirm that is
+    # the layout prepare_weights_from_stacked expects.
+    w13_gs = torch.cat([gate_gs, up_gs], dim=0)
+
+    return w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
+
+
+def _run_moe_runner_cases(layer_id):
+    """Build a CuTeDSLMoERunner from real weights and run it on several batch sizes.
+
+    Returns:
+        List of (num_tokens, nan_rows) for every batch size whose output
+        contained NaN; empty when all outputs were NaN-free.
+    """
+    from cutedsl.runner import CuTeDSLMoERunner
+
+    with open(os.path.join(MODEL, "model.safetensors.index.json")) as f:
+        wm = json.load(f)["weight_map"]
+    G = lambda k: P(k, wm, MODEL).to(DEV)
+
+    p = f"model.layers.{layer_id}"
+    emb = G("model.embed_tokens.weight")
+    fnorm = G(f"{p}.post_attention_layernorm.weight")
+
+    num_local_experts = NUM_EXPERTS
+    print(f"  Packing expert weights ({num_local_experts} experts)...")
+    runner = CuTeDSLMoERunner(
+        num_experts=num_local_experts,
+        hidden_size=H,
+        intermediate_size=INTERMEDIATE,
+        max_num_tokens=8192,
+        top_k=TOPK,
+        device=str(DEV),
+    )
+
+    print("  Loading expert weights...")
+    w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs = pack_expert_weights(wm, G, layer_id, num_local_experts)
+
+    print(f"  w13_w: {w13_w.shape}, w2_w: {w2_w.shape}")
+    print(f"  w13_gs: {w13_gs.shape}, w2_gs: {w2_gs.shape}")
+    # NOTE(review): .float() materializes an fp32 copy of each full packed weight, and
+    # integer-typed packed bytes can never be NaN — confirm the dtype being checked.
+    print(f"  w13 NaN: {torch.isnan(w13_w.float()).any()}")
+    print(f"  w2 NaN: {torch.isnan(w2_w.float()).any()}")
+
+    runner.prepare_weights_from_stacked(
+        w13_w.view(torch.float4_e2m1fn_x2), w13_sf.to(torch.float8_e4m3fn), w13_gs.flatten().tolist(),
+        w2_w.view(torch.float4_e2m1fn_x2), w2_sf.to(torch.float8_e4m3fn), w2_gs.flatten().tolist(),
+    )
+
+    nan_failures = []
+    for num_tokens in [1, 4, 8, 16]:
+        token_ids = torch.randint(1, 1000, (num_tokens,), dtype=torch.long, device=DEV)
+        normed = rms(emb[token_ids], fnorm, EPS)
+
+        # Top-k routing never selects one expert twice for a token, so draw
+        # TOPK distinct experts per row rather than independent random ids.
+        topk_ids = torch.rand(num_tokens, num_local_experts, device=DEV).topk(TOPK, dim=-1).indices
+        topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)
+
+        print(f"  {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")
+        with torch.no_grad():
+            result = runner.run(normed, topk_weights, topk_ids)
+
+        result_nan = torch.isnan(result).any().item()
+        result_amax = result.amax().item() if not result_nan else -1
+        print(f"  {num_tokens} tokens: output amax={result_amax:.4f} NaN={result_nan}")
+        if result_nan:
+            nan_rows = torch.isnan(result).any(dim=1).sum().item()
+            print(f"  {num_tokens} tokens: {nan_rows}/{num_tokens} rows have NaN")
+            nan_failures.append((num_tokens, nan_rows))
+    return nan_failures
+
+
+def test_moe_runner(layer_id=2):
+    """Test the CuTeDSLMoERunner with real weights; raise AssertionError on NaN output."""
+    torch.cuda.set_device(0)
+    torch.manual_seed(42)
+    torch.cuda.empty_cache()
+    _cache.clear()
+    try:
+        nan_failures = _run_moe_runner_cases(layer_id)
+    finally:
+        # Runs even when the runner raises; after a normal return the helper's
+        # locals are gone, so empty_cache() can actually release their memory.
+        torch.cuda.empty_cache()
+        _cache.clear()
+    if nan_failures:
+        raise AssertionError(f"MoE runner output has NaN, (num_tokens, nan_rows): {nan_failures}")
+
+
+def main():
+    """Print a banner, run the layer-2 MoE runner test, then print a closing rule."""
+    rule = "=" * 70
+    title = "  DeepSeek-V4 MoE Runner NaN Test"
+    subtitle = "  Tests CuTeDSLMoERunner (grouped GEMM) with real weights"
+    print("\n".join((rule, title, subtitle, rule)))
+
+    test_moe_runner(layer_id=2)
+    print("\n" + rule)
+
+
+if __name__ == "__main__":  # run the test only when executed as a script, not on import
+    main()