- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4/layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name
191 lines
6.6 KiB
Python
191 lines
6.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
DeepSeek-V4 MoE Runner NaN Test

Tests the Nvfp4MoE (grouped GEMM path) with real weights.
The single-expert tests pass — this test exercises the FULL MoE runner
with routing, padding, grouped GEMM, and combine.

Usage (on B200):
    cd /root/nvfp4-megamoe-kernel
    PYTHONPATH=/root/nvfp4-megamoe-kernel tests/venv/bin/python tests/test_moe_runner_nan_b200.py
"""
|
|
|
|
import sys, os, json, torch, torch.nn.functional as F
|
|
from safetensors import safe_open
|
|
|
|
# Repository root, pushed to the front of sys.path (presumably so the in-repo
# `dsv4` package imported by test_moe_runner() resolves without installation).
REPO = "/root/nvfp4-megamoe-kernel"
sys.path.insert(0, REPO)
# Checkpoint directory; test_moe_runner() reads model.safetensors.index.json
# from here and P() opens the shard files it references.
MODEL = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
# Device every loaded tensor is moved to and on which the runner is built.
DEV = "cuda:0"

H = 7168  # hidden size handed to Nvfp4MoE
INTERMEDIATE = 3072  # intermediate size handed to Nvfp4MoE
NUM_EXPERTS = 384  # presumably the checkpoint's total expert count; the test loads only a subset
TOPK = 6  # experts per token (passed as top_k and used for the routing tensors)
EPS = 1e-6  # epsilon passed to rms()

# Memoizes tensors returned by P(), keyed by tensor name.
_cache = {}
|
|
def P(k, wm, md):
    """Return the tensor named ``k``, loading it from disk at most once.

    Args:
        k: Tensor name; used both as the cache key and as the key inside
            the safetensors shard.
        wm: Mapping from tensor name to shard filename.
        md: Directory that contains the shard files.

    Returns:
        The tensor as read by ``safe_open(..., framework="pt")``. Later calls
        with the same ``k`` return the object stored in ``_cache``.
    """
    try:
        return _cache[k]
    except KeyError:
        pass

    shard_path = os.path.join(md, wm[k])
    with safe_open(shard_path, framework="pt") as shard:
        tensor = shard.get_tensor(k)
        _cache[k] = tensor
    return tensor
|
|
|
|
def rms(x, w, eps=1e-6):
    """Apply RMS normalization over the last dimension of ``x``.

    The mean of squares is computed in float32, ``x`` is scaled by
    ``rsqrt(mean_sq + eps)``, multiplied by ``w`` in float32, and the result
    is cast back to ``x.dtype``.

    Args:
        x: Input tensor; normalized along its last dimension.
        w: Scale tensor multiplied with the normalized input.
        eps: Added to the mean of squares before the reciprocal square root.

    Returns:
        Tensor with the dtype of ``x``.
    """
    mean_sq = x.float().pow(2).mean(-1, keepdim=True)
    inv_rms = torch.rsqrt(mean_sq + eps)
    normalized = (x * inv_rms).float()
    scaled = w.float() * normalized
    return scaled.to(x.dtype)
|
|
|
|
|
|
def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
    """Pack per-expert weights into the stacked format used for Nvfp4MoE.

    Only experts ``0 .. num_local_experts - 1`` of the given layer are loaded,
    so the packed tensors fit in memory.

    Args:
        wm: Unused here; kept so existing callers keep working (the loader
            ``G`` is what actually resolves tensor names).
        G: Callable mapping a tensor name to a tensor.
        layer_id: Index substituted into ``model.layers.{layer_id}.mlp``.
        num_local_experts: Number of leading experts to load.

    Returns:
        Tuple ``(w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs)``:
        ``w13_*`` hold gate_proj followed by up_proj, ``w2_*`` hold down_proj.
        Weights and block scales are stacked on a new leading expert axis,
        with gate and up concatenated along dim 1. The ``weight_scale_2``
        values are stacked per expert, and for ``w13_gs`` the gate stack is
        followed by the up stack along dim 0.
    """
    prefix = f"model.layers.{layer_id}.mlp"
    projections = ("gate_proj", "up_proj", "down_proj")
    weights = {name: [] for name in projections}
    block_scales = {name: [] for name in projections}
    global_scales = {name: [] for name in projections}

    for i in range(num_local_experts):
        expert = f"{prefix}.experts.{i}"
        for name in projections:
            weights[name].append(G(f"{expert}.{name}.weight"))
            block_scales[name].append(G(f"{expert}.{name}.weight_scale"))
            global_scales[name].append(G(f"{expert}.{name}.weight_scale_2"))

        if i % 50 == 0:
            print(f" Loaded expert {i}/{num_local_experts}")

    # w13 = gate rows followed by up rows for each expert (concatenated, not
    # interleaved). The original author's shape note for this checkpoint is
    # (E, 6144, 3584), i.e. (E, 2*intermediate, hidden//2).
    w13_w = torch.cat(
        [torch.stack(weights["gate_proj"]), torch.stack(weights["up_proj"])], dim=1
    )
    w13_sf = torch.cat(
        [torch.stack(block_scales["gate_proj"]), torch.stack(block_scales["up_proj"])],
        dim=1,
    )
    # NOTE(review): with scalar global scales this yields a length-2E vector
    # laid out as [gate_0..gate_{E-1}, up_0..up_{E-1}] — confirm that this is
    # the order the runner expects.
    w13_gs = torch.cat(
        [torch.stack(global_scales["gate_proj"]), torch.stack(global_scales["up_proj"])],
        dim=0,
    )

    # w2 = down projection, stacked on the expert axis.
    w2_w = torch.stack(weights["down_proj"])
    w2_sf = torch.stack(block_scales["down_proj"])
    w2_gs = torch.stack(global_scales["down_proj"])

    return w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
|
|
|
|
|
|
def test_moe_runner(layer_id=2):
    """Run Nvfp4MoE on real checkpoint weights and report NaNs in its output.

    Loads the embedding table, the layer's post-attention norm weight and the
    first 16 experts of ``model.layers.{layer_id}.mlp``, builds an Nvfp4MoE
    runner from them, and feeds it RMS-normalized embeddings of random token
    ids with random top-k expert ids and softmax-normalized random weights,
    for batches of 1, 4, 8 and 16 tokens. Input and output statistics are
    printed for each batch size.

    This is a diagnostic: it prints whether NaNs were found but never raises
    on them.

    Args:
        layer_id: Transformer layer whose MoE weights are loaded.
    """
    from dsv4.layers.moe import Nvfp4MoE

    torch.cuda.set_device(0)
    torch.manual_seed(42)
    torch.cuda.empty_cache()
    _cache.clear()

    with open(os.path.join(MODEL, "model.safetensors.index.json")) as f:
        wm = json.load(f)["weight_map"]

    def G(k):
        # Load (memoized on the host by P) and move to the test device.
        return P(k, wm, MODEL).to(DEV)

    p = f"model.layers.{layer_id}"

    emb = G("model.embed_tokens.weight")
    fnorm = G(f"{p}.post_attention_layernorm.weight")

    # Test with fewer experts to fit in memory.
    num_local_experts = 16
    print(f" Packing expert weights ({num_local_experts} of {NUM_EXPERTS} experts)...")

    # Create the runner first, then prepare weights.
    intermediate_size = INTERMEDIATE
    hidden_size = H

    runner = Nvfp4MoE(
        num_experts=num_local_experts,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        max_num_tokens=8192,
        top_k=TOPK,
        device=str(DEV),
    )

    # Load and pack weights.
    print(f" Loading expert weights...")
    w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs = pack_expert_weights(
        wm, G, layer_id, num_local_experts
    )

    print(f" w13_w: {w13_w.shape}, w2_w: {w2_w.shape}")
    print(f" w13_gs: {w13_gs.shape}, w2_gs: {w2_gs.shape}")
    # NOTE(review): if the packed weights are stored as an integer dtype this
    # check can never report NaN — confirm it is still meaningful.
    print(f" w13 NaN: {torch.isnan(w13_w.float()).any()}")
    print(f" w2 NaN: {torch.isnan(w2_w.float()).any()}")

    # Prepare weights for the runner: view the packed weights as the fp4x2
    # dtype and make sure the block scales are float8_e4m3fn (``.to`` returns
    # the tensor itself when the dtype already matches).
    l1_fp4 = w13_w.view(torch.float4_e2m1fn_x2)
    l2_fp4 = w2_w.view(torch.float4_e2m1fn_x2)
    l1_sf = w13_sf.to(torch.float8_e4m3fn)
    l2_sf = w2_sf.to(torch.float8_e4m3fn)

    # flatten() is a no-op for 1-D input, so one expression covers every rank.
    runner.prepare_weights_from_stacked(
        l1_fp4, l1_sf, w13_gs.flatten().tolist(),
        l2_fp4, l2_sf, w2_gs.flatten().tolist(),
    )

    # Test with various token counts.
    for num_tokens in [1, 4, 8, 16]:
        token_ids = torch.randint(1, 1000, (num_tokens,), dtype=torch.long, device=DEV)
        hidden = emb[token_ids]
        normed = rms(hidden, fnorm, EPS)

        # Routing is synthetic: random expert ids and random normalized weights.
        topk_ids = torch.randint(0, num_local_experts, (num_tokens, TOPK), device=DEV)
        topk_weights = torch.softmax(torch.randn(num_tokens, TOPK, device=DEV), dim=-1)

        print(f" {num_tokens} tokens: input amax={normed.amax():.4f} NaN={torch.isnan(normed).any()}")

        with torch.no_grad():
            result = runner.run(normed, topk_weights, topk_ids)

        result_nan = torch.isnan(result).any().item()
        # -1 is a placeholder so the formatted print stays valid when NaNs are present.
        result_amax = result.amax().item() if not result_nan else -1
        print(f" {num_tokens} tokens: output amax={result_amax:.4f} NaN={result_nan}")

        if result_nan:
            nan_rows = torch.isnan(result).any(dim=1).sum().item()
            print(f" {num_tokens} tokens: {nan_rows}/{num_tokens} rows have NaN")

    # Drop every reference to device memory, including the dtype views and the
    # per-iteration tensors, so that empty_cache() can actually release it.
    del runner, w13_w, w13_sf, w13_gs, w2_w, w2_sf, w2_gs
    del l1_fp4, l2_fp4, l1_sf, l2_sf, emb, fnorm
    del token_ids, hidden, normed, topk_ids, topk_weights, result
    torch.cuda.empty_cache()
    _cache.clear()
|
|
|
|
|
|
def main():
    """Print the banner, run the MoE runner NaN check on layer 2, and close with a rule."""
    rule = "=" * 70
    header = (
        rule,
        " DeepSeek-V4 MoE Runner NaN Test",
        " Tests Nvfp4MoE (grouped GEMM) with real weights",
        rule,
    )
    for line in header:
        print(line)

    test_moe_runner(layer_id=2)

    # Blank line, then the closing rule.
    print("\n" + rule)
|
|
|
|
|
|
# Run the diagnostic only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|