Files
nvfp4-megamoe-kernel/tests/archive/test_moe_runner_nan_b200.py
biondizzle 9cbdc92744 Restructure: cutedsl/ -> dsv4/ with proper layering
- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4/layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name
2026-05-21 17:30:44 +00:00

191 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""
DeepSeek-V4 MoE Runner NaN Test
Tests the Nvfp4MoE (grouped GEMM path) with real weights.
The single-expert tests pass — this test exercises the FULL MoE runner
with routing, padding, grouped GEMM, and combine.
Usage (on B200):
cd /root/nvfp4-megamoe-kernel
PYTHONPATH=/root/nvfp4-megamoe-kernel tests/venv/bin/python tests/archive/test_moe_runner_nan_b200.py
"""
import sys, os, json, torch, torch.nn.functional as F
from safetensors import safe_open
# Repository root; placed first on sys.path so in-repo packages can be imported.
REPO = "/root/nvfp4-megamoe-kernel"
sys.path.insert(0, REPO)
# Checkpoint directory; must contain model.safetensors.index.json and its shards.
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
# Device that loaded tensors are moved to and on which the runner is created.
DEV = "cuda:0"
# Hidden size handed to the MoE runner.
H = 7168
# Per-expert intermediate size handed to the MoE runner.
INTERMEDIATE = 3072
# Presumably the number of routed experts per layer in the full checkpoint — TODO confirm.
NUM_EXPERTS = 384
# Number of experts selected per token (the runner's top_k).
TOPK = 6
# Epsilon passed to rms() when normalizing the runner input.
EPS = 1e-6
# Tensor name -> loaded tensor; filled by P() so each tensor is read from disk once.
_cache = {}
def P(k, wm, md):
    """Return the checkpoint tensor named ``k``, reading it from disk at most once.

    Args:
        k: Tensor name, used both as the cache key and as the key inside the
            safetensors shard.
        wm: Mapping from tensor name to the shard file name that holds it.
        md: Directory containing the shard files.

    Returns:
        The tensor as loaded by ``safe_open(..., framework="pt")``. Repeated
        calls with the same ``k`` return the same cached object.
    """
    if k not in _cache:
        shard_path = os.path.join(md, wm[k])
        with safe_open(shard_path, framework="pt") as shard:
            loaded = shard.get_tensor(k)
        # Store only after the shard has been closed cleanly.
        _cache[k] = loaded
    return _cache[k]
def rms(x, w, eps=1e-6):
    """Apply RMS normalization over the last dimension of ``x``, scaled by ``w``.

    The mean of squares and the scaling are evaluated in float32; the result
    is cast back to ``x.dtype`` before being returned.

    Args:
        x: Input tensor; normalized along its last dimension.
        w: Scale tensor, broadcast against the normalized input.
        eps: Value added to the mean of squares before the reciprocal sqrt.

    Returns:
        Tensor with the dtype of ``x``.
    """
    mean_square = x.float().pow(2).mean(-1, keepdim=True)
    inv_rms = torch.rsqrt(mean_square + eps)
    normalized = (x * inv_rms).float()
    return (w.float() * normalized).to(x.dtype)
def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
    """Pack per-expert weights into the stacked format used for Nvfp4MoE.

    Only the first ``num_local_experts`` experts of the layer are loaded so
    the test fits in memory.

    Args:
        wm: Weight map from the safetensors index. Not read here (``G``
            performs the lookups); kept so existing call sites keep working.
        G: Callable mapping a tensor name to the loaded tensor.
        layer_id: Index of the decoder layer whose experts are packed.
        num_local_experts: Number of experts, starting at index 0, to load.

    Returns:
        Tuple ``(w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs)``:

        * ``w13_w`` / ``w13_sf``: gate_proj and up_proj weights / block scales,
          each stacked over experts and then concatenated along dim 1
          (gate rows first, up rows second).
        * ``w13_gs``: stacked gate_proj ``weight_scale_2`` values followed by
          the stacked up_proj ones, concatenated along dim 0.
        * ``w2_w`` / ``w2_sf`` / ``w2_gs``: down_proj weight, block scale and
          ``weight_scale_2``, stacked over experts.

    Raises:
        ValueError: If ``num_local_experts`` is smaller than 1.
    """
    if num_local_experts < 1:
        raise ValueError(
            f"num_local_experts must be >= 1, got {num_local_experts}"
        )

    m = f"model.layers.{layer_id}.mlp"
    projections = ("gate_proj", "up_proj", "down_proj")
    suffixes = ("weight", "weight_scale", "weight_scale_2")

    # One list per (projection, tensor kind); filled in expert order.
    loaded = {(proj, suffix): [] for proj in projections for suffix in suffixes}
    for i in range(num_local_experts):
        e = f"{m}.experts.{i}"
        for proj in projections:
            for suffix in suffixes:
                loaded[(proj, suffix)].append(G(f"{e}.{proj}.{suffix}"))
        if i % 50 == 0:
            print(f" Loaded expert {i}/{num_local_experts}")

    def stacked(proj, suffix):
        # Stack the per-expert tensors into a single (E, ...) tensor.
        return torch.stack(loaded[(proj, suffix)])

    # w13 = gate and up placed side by side along dim 1; w2 = down.
    # Each per-expert list is stacked exactly once (the earlier gate-only
    # stacking was dead code that was immediately overwritten).
    w13_w = torch.cat(
        [stacked("gate_proj", "weight"), stacked("up_proj", "weight")], dim=1
    )
    w13_sf = torch.cat(
        [stacked("gate_proj", "weight_scale"), stacked("up_proj", "weight_scale")],
        dim=1,
    )
    # NOTE(review): the global scales are laid out as all gate scales followed
    # by all up scales (not interleaved per expert) — confirm this is the
    # order the runner's weight-preparation step expects.
    w13_gs = torch.cat(
        [
            stacked("gate_proj", "weight_scale_2"),
            stacked("up_proj", "weight_scale_2"),
        ],
        dim=0,
    )

    w2_w = stacked("down_proj", "weight")
    w2_sf = stacked("down_proj", "weight_scale")
    w2_gs = stacked("down_proj", "weight_scale_2")
    return w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
def test_moe_runner(layer_id=2):
    """Run Nvfp4MoE on real checkpoint weights and report NaNs in its output.

    Loads the embedding table, the layer's post-attention norm weight and the
    first 16 experts of ``layer_id``, feeds RMS-normalized embeddings of
    random token ids through the runner with random routing, and prints the
    output amax / NaN status for several token counts.

    This is a diagnostic: results are printed to stdout, nothing is asserted
    and nothing is returned. Requires a CUDA device and the checkpoint under
    ``MODEL``.

    Args:
        layer_id: Index of the decoder layer whose experts are exercised.
    """
    from dsv4.layers.moe import Nvfp4MoE

    torch.cuda.set_device(0)
    torch.manual_seed(42)
    torch.cuda.empty_cache()
    _cache.clear()

    with open(os.path.join(MODEL, "model.safetensors.index.json")) as f:
        wm = json.load(f)["weight_map"]

    def G(k):
        # Load (or fetch from the cache) a checkpoint tensor and move it to DEV.
        return P(k, wm, MODEL).to(DEV)

    p = f"model.layers.{layer_id}"
    emb = G("model.embed_tokens.weight")
    fnorm = G(f"{p}.post_attention_layernorm.weight")

    # Only a subset of the experts is used so the test fits in memory.
    num_local_experts = 16
    print(f" Packing expert weights ({num_local_experts} of {NUM_EXPERTS} experts)...")

    # Create the runner first, then prepare weights.
    runner = Nvfp4MoE(
        num_experts=num_local_experts,
        hidden_size=H,
        intermediate_size=INTERMEDIATE,
        max_num_tokens=8192,
        top_k=TOPK,
        device=str(DEV),
    )

    # Load and pack weights.
    print(f" Loading expert weights...")
    w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs = pack_expert_weights(
        wm, G, layer_id, num_local_experts
    )
    print(f" w13_w: {w13_w.shape}, w2_w: {w2_w.shape}")
    print(f" w13_gs: {w13_gs.shape}, w2_gs: {w2_gs.shape}")
    print(f" w13 NaN: {torch.isnan(w13_w.float()).any()}")
    print(f" w2 NaN: {torch.isnan(w2_w.float()).any()}")

    # Reinterpret the packed weight bytes as FP4 pairs for the runner.
    l1_fp4 = w13_w.view(torch.float4_e2m1fn_x2)
    l2_fp4 = w2_w.view(torch.float4_e2m1fn_x2)
    # .to() is a no-op when the scales are already float8_e4m3fn.
    # NOTE(review): for any other stored dtype this is a numeric cast, not a
    # bit reinterpretation — confirm that is what the checkpoint needs.
    l1_sf = w13_sf.to(torch.float8_e4m3fn)
    l2_sf = w2_sf.to(torch.float8_e4m3fn)
    runner.prepare_weights_from_stacked(
        l1_fp4, l1_sf, w13_gs.flatten().tolist(),
        l2_fp4, l2_sf, w2_gs.flatten().tolist(),
    )

    # Test with various token counts.
    for num_tokens in [1, 4, 8, 16]:
        token_ids = torch.randint(1, 1000, (num_tokens,), dtype=torch.long, device=DEV)
        hidden = emb[token_ids]
        normed = rms(hidden, fnorm, EPS)
        # NOTE(review): random routing may pick the same expert more than once
        # for a token, which real top-k routing would not — confirm this is
        # an intended input for the runner.
        topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
        topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)
        print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")
        with torch.no_grad():
            result = runner.run(normed, topk_weights, topk_ids)
        result_nan = torch.isnan(result).any().item()
        # amax of a tensor containing NaN is NaN, so -1 is printed as a marker instead.
        result_amax = result.amax().item() if not result_nan else -1
        print(f" {num_tokens} tokens: output amax={result_amax:.4f} NaN={result_nan}")
        if result_nan:
            nan_rows = torch.isnan(result).any(dim=1).sum().item()
            print(f" {num_tokens} tokens: {nan_rows}/{num_tokens} rows have NaN")

    # Drop every name that still references the large weight storages
    # (including the dtype views) so empty_cache() can actually release them.
    del runner, w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
    del l1_fp4, l2_fp4, l1_sf, l2_sf, emb, fnorm
    torch.cuda.empty_cache()
    _cache.clear()
def main():
    """Print the banner, run the MoE runner diagnostic for layer 2, and close the banner."""
    rule = "=" * 70
    print(rule)
    print(" DeepSeek-V4 MoE Runner NaN Test")
    print(" Tests Nvfp4MoE (grouped GEMM) with real weights")
    print(rule)
    test_moe_runner(layer_id=2)
    print("\n" + rule)


# Run the diagnostic only when executed as a script, not when imported.
if __name__ == "__main__":
    main()