Add MoE runner NaN test (grouped GEMM with real weights)
This commit is contained in:
188
tests/test_moe_runner_nan_b200.py
Normal file
188
tests/test_moe_runner_nan_b200.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DeepSeek-V4 MoE Runner NaN Test
|
||||
|
||||
Tests the CuTeDSLMoERunner (grouped GEMM path) with real weights.
|
||||
The single-expert tests pass — this test exercises the FULL MoE runner
|
||||
with routing, padding, grouped GEMM, and combine.
|
||||
|
||||
Usage (on B200):
|
||||
cd /root/nvfp4-megamoe-kernel
|
||||
PYTHONPATH=/root/nvfp4-megamoe-kernel tests/venv/bin/python tests/test_moe_runner_nan_b200.py
|
||||
"""
|
||||
|
||||
import sys, os, json, torch, torch.nn.functional as F
|
||||
from safetensors import safe_open
|
||||
|
||||
# Root of the kernel repository. It is put at the front of sys.path so the
# function-scope `from cutedsl.runner import ...` in the test can resolve.
# Override with MOE_TEST_REPO when the checkout lives elsewhere.
REPO = os.environ.get("MOE_TEST_REPO", "/root/nvfp4-megamoe-kernel")
sys.path.insert(0, REPO)

# Checkpoint directory; it must contain model.safetensors.index.json and the
# shards that index refers to. Override with MOE_TEST_MODEL.
MODEL = os.environ.get("MOE_TEST_MODEL", "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4")

# Device every loaded tensor and every random test input is placed on.
DEV = "cuda:0"

H = 7168             # hidden size handed to the runner
INTERMEDIATE = 3072  # intermediate size handed to the runner
NUM_EXPERTS = 384    # number of experts in the layer under test
TOPK = 6             # experts selected per token
EPS = 1e-6           # epsilon passed to rms() by the test

# Tensor key -> tensor as read from disk; filled by P(), cleared by the test.
_cache = {}
|
||||
def P(k, wm, md):
    """Return the tensor stored under key ``k``, reading it from disk at most once.

    Args:
        k: Tensor name as it appears in the safetensors weight map.
        wm: Mapping from tensor name to the shard file name that holds it.
        md: Directory containing the shard files.

    Returns:
        The tensor as returned by ``safe_open(..., framework="pt")``. The same
        object is returned on later calls until ``_cache`` is cleared.

    Raises:
        KeyError: If ``k`` is not present in ``wm``.
    """
    try:
        return _cache[k]
    except KeyError:
        pass

    try:
        shard_name = wm[k]
    except KeyError:
        # Same exception type as before, but say which tensor was missing.
        raise KeyError(f"tensor {k!r} is not listed in the weight map") from None

    with safe_open(os.path.join(md, shard_name), framework="pt") as shard:
        tensor = shard.get_tensor(k)
    _cache[k] = tensor
    return tensor
|
||||
|
||||
def rms(x, w, eps=1e-6):
    """Apply RMS normalisation over the last dimension of ``x`` and scale by ``w``.

    The arithmetic is done in float32 and the result is cast back to the
    dtype of ``x``.

    Args:
        x: Input tensor; normalised along its last dimension.
        w: Scale tensor, broadcast against ``x``.
        eps: Value added to the mean square before the reciprocal square root.

    Returns:
        Tensor with the shape and dtype of ``x``.
    """
    # Upcast once, explicitly, instead of relying on dtype promotion inside
    # the multiplication.
    x32 = x.float()
    mean_square = x32.pow(2).mean(-1, keepdim=True)
    normalised = x32 * torch.rsqrt(mean_square + eps)
    return (w.float() * normalised).to(x.dtype)
|
||||
|
||||
|
||||
def pack_expert_weights(wm, G, layer_id=2, num_local_experts=384):
    """Pack per-expert weights into stacked format for CuTeDSLMoERunner.

    For every expert ``0 .. num_local_experts - 1`` of layer ``layer_id`` the
    ``weight``, ``weight_scale`` and ``weight_scale_2`` tensors of
    ``gate_proj``, ``up_proj`` and ``down_proj`` are fetched through ``G``
    (in that order) and stacked along a new leading expert dimension.

    Args:
        wm: Weight map. Not read here (``G`` resolves keys on its own); kept
            so existing callers do not have to change.
        G: Callable taking a tensor key and returning the tensor.
        layer_id: Index of the layer whose experts are packed.
        num_local_experts: Number of experts to load; must be positive.

    Returns:
        Tuple ``(w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs)``:
        ``w13_w`` / ``w13_sf`` are the stacked gate tensors followed by the
        stacked up tensors along dim 1; ``w13_gs`` is the stacked gate
        ``weight_scale_2`` followed by the stacked up ``weight_scale_2`` along
        dim 0; the ``w2_*`` tensors are the stacked ``down_proj`` tensors.

    Raises:
        ValueError: If ``num_local_experts`` is not positive.
    """
    if num_local_experts <= 0:
        raise ValueError(
            f"num_local_experts must be positive, got {num_local_experts}"
        )

    mlp_prefix = f"model.layers.{layer_id}.mlp"
    projections = ("gate_proj", "up_proj", "down_proj")
    suffixes = ("weight", "weight_scale", "weight_scale_2")
    per_expert = {
        (proj, suffix): [] for proj in projections for suffix in suffixes
    }

    for i in range(num_local_experts):
        expert_prefix = f"{mlp_prefix}.experts.{i}"
        for proj in projections:
            for suffix in suffixes:
                per_expert[proj, suffix].append(G(f"{expert_prefix}.{proj}.{suffix}"))

        if i % 50 == 0:
            print(f" Loaded expert {i}/{num_local_experts}")

    def stacked(proj, suffix):
        # pop() so the per-expert list is dropped as soon as it is stacked.
        return torch.stack(per_expert.pop((proj, suffix)))

    # Layer 1: gate followed by up along dim 1. The original comment gives
    # (E, 6144, 3584) for the weights of this checkpoint.
    w13_w = torch.cat(
        [stacked("gate_proj", "weight"), stacked("up_proj", "weight")], dim=1
    )
    w13_sf = torch.cat(
        [stacked("gate_proj", "weight_scale"), stacked("up_proj", "weight_scale")],
        dim=1,
    )
    # NOTE(review): this concatenates along dim 0, so scalar per-expert scales
    # yield 2 * E values (all gate scales, then all up scales). Confirm that is
    # the layout the runner expects for the layer-1 global scales.
    w13_gs = torch.cat(
        [
            stacked("gate_proj", "weight_scale_2"),
            stacked("up_proj", "weight_scale_2"),
        ],
        dim=0,
    )

    # Layer 2: down projection only.
    w2_w = stacked("down_proj", "weight")
    w2_sf = stacked("down_proj", "weight_scale")
    w2_gs = stacked("down_proj", "weight_scale_2")

    return w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
|
||||
|
||||
|
||||
def test_moe_runner(layer_id=2):
    """Test the CuTeDSLMoERunner with real weights.

    Loads the embedding table, the post-attention norm weight and all expert
    weights of layer ``layer_id`` from ``MODEL``, hands the stacked weights to
    a ``CuTeDSLMoERunner`` and runs it for 1, 4, 8 and 16 tokens with random
    routing, printing the input/output amax and whether NaN is present.

    Args:
        layer_id: Index of the layer whose MoE weights are exercised.

    Returns:
        bool: True if the runner output contained NaN for any token count.
    """
    from cutedsl.runner import CuTeDSLMoERunner

    torch.cuda.set_device(0)
    torch.manual_seed(42)
    torch.cuda.empty_cache()
    _cache.clear()

    with open(os.path.join(MODEL, "model.safetensors.index.json")) as f:
        wm = json.load(f)["weight_map"]

    def G(k):
        # Load (cached by P) and move to the test device.
        return P(k, wm, MODEL).to(DEV)

    p = f"model.layers.{layer_id}"

    emb = G("model.embed_tokens.weight")
    fnorm = G(f"{p}.post_attention_layernorm.weight")

    # All experts are loaded (same value as the module-level NUM_EXPERTS).
    # This takes a while and uses a lot of memory.
    num_local_experts = 384
    print(f" Packing expert weights ({num_local_experts} experts)...")

    # Create the runner first, then prepare weights.
    runner = CuTeDSLMoERunner(
        num_experts=num_local_experts,
        hidden_size=H,
        intermediate_size=INTERMEDIATE,
        max_num_tokens=8192,
        top_k=TOPK,
        device=str(DEV),
    )

    # Load and pack weights.
    print(f" Loading expert weights...")
    w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs = pack_expert_weights(
        wm, G, layer_id, num_local_experts
    )

    print(f" w13_w: {w13_w.shape}, w2_w: {w2_w.shape}")
    print(f" w13_gs: {w13_gs.shape}, w2_gs: {w2_gs.shape}")
    # NOTE(review): .float() materialises a float32 copy of the full stacked
    # weight tensor; if these are packed integer bytes the NaN check cannot
    # fire. Confirm the dtype and whether the check is still wanted.
    print(f" w13 NaN: {torch.isnan(w13_w.float()).any()}")
    print(f" w2 NaN: {torch.isnan(w2_w.float()).any()}")

    # Prepare weights for the runner: reinterpret the packed weights as FP4
    # pairs. Tensor.to() returns the tensor itself when the dtype already
    # matches, so no dtype guard is needed for the scale factors.
    l1_fp4 = w13_w.view(torch.float4_e2m1fn_x2)
    l2_fp4 = w2_w.view(torch.float4_e2m1fn_x2)
    l1_sf = w13_sf.to(torch.float8_e4m3fn)
    l2_sf = w2_sf.to(torch.float8_e4m3fn)

    # flatten() leaves a 1-D tensor unchanged, so this covers every rank.
    runner.prepare_weights_from_stacked(
        l1_fp4, l1_sf, w13_gs.flatten().tolist(),
        l2_fp4, l2_sf, w2_gs.flatten().tolist(),
    )

    any_nan = False

    # Test with various token counts.
    for num_tokens in [1, 4, 8, 16]:
        token_ids = torch.randint(1, 1000, (num_tokens,), dtype=torch.long, device=DEV)
        hidden = emb[token_ids]
        normed = rms(hidden, fnorm, EPS)

        # Random routing rather than a real router.
        # NOTE(review): randint can pick the same expert twice for one token;
        # confirm the runner is expected to handle duplicate top-k ids.
        topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
        topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)

        print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")

        with torch.no_grad():
            result = runner.run(normed, topk_weights, topk_ids)

        nan_mask = torch.isnan(result)
        result_nan = nan_mask.any().item()
        # -1 is printed in place of amax when the output contains NaN.
        result_amax = result.amax().item() if not result_nan else -1
        print(f" {num_tokens} tokens: output amax={result_amax:.4f} NaN={result_nan}")

        if result_nan:
            any_nan = True
            nan_rows = nan_mask.any(dim=1).sum().item()
            print(f" {num_tokens} tokens: {nan_rows}/{num_tokens} rows have NaN")

    # Drop every reference to the packed weights, including the reinterpreted
    # views and converted scales, before asking the allocator to release memory.
    del runner, w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
    del l1_fp4, l2_fp4, l1_sf, l2_sf
    torch.cuda.empty_cache()
    _cache.clear()

    return any_nan
|
||||
|
||||
|
||||
def main():
    """Print the banner, run the MoE runner NaN test on layer 2, and print a closing rule."""
    rule = "=" * 70

    print(rule)
    print(" DeepSeek-V4 MoE Runner NaN Test")
    print(" Tests CuTeDSLMoERunner (grouped GEMM) with real weights")
    print(rule)

    test_moe_runner(layer_id=2)

    print(f"\n{rule}")
|
||||
|
||||
|
||||
# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user