- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4/layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name
71 lines
2.7 KiB
Python
71 lines
2.7 KiB
Python
"""Debug: Compare runner output vs reference pipeline output.

Focus on whether the scale assembly + GEMM produces correct values.

The reference pipeline and the runner are fed the *same* hidden states,
routing weights and expert ids, so the printed cosine similarities reflect
differences between the two implementations rather than differences
between two unrelated random inputs.
"""
import sys

import torch

sys.path.insert(0, '/root/nvfp4-megamoe-kernel/cutedsl')
sys.path.insert(0, '/root/nvfp4-megamoe-kernel/vllm')

from cutedsl.reference.moe_pipeline import moe_pipeline
from cutedsl.weight_loader import load_layer_weights
from vllm.nvfp4_cutedsl import Nvfp4MoE

# Problem sizes shared by the reference call, the runner and the test inputs.
NUM_EXPERTS = 3
HIDDEN_SIZE = 256
INTERMEDIATE_SIZE = 512
NUM_TOKENS = 4
TOP_K = 2

torch.cuda.set_device(0)
# Fixed seed so repeated runs of this debug script print comparable numbers.
torch.manual_seed(0)

# Load real model weights for layer 0
weights = load_layer_weights(layer_idx=0, num_experts=NUM_EXPERTS)

# One shared set of inputs for BOTH implementations.  Drawing a separate
# torch.randn tensor per implementation would make the comparison below
# meaningless.
hs = torch.randn(NUM_TOKENS, HIDDEN_SIZE, dtype=torch.bfloat16, device='cuda')
# Every token gets equal routing weight across its TOP_K experts.
tw = torch.ones(NUM_TOKENS, TOP_K, dtype=torch.float32, device='cuda') / 2
# Every token is routed to experts 0 and 1.
ti = torch.tensor([[0, 1]] * NUM_TOKENS, dtype=torch.int64, device='cuda')

# Run reference pipeline with dynamic gs
# Clones are passed so that, should the reference modify its arguments in
# place, the runner below still sees the pristine inputs.
ref_out = moe_pipeline(
    hidden_states=hs.clone(),
    topk_weights=tw.clone(),
    topk_ids=ti.clone(),
    l1_fp4=weights['l1_fp4'],
    l1_sf=weights['l1_sf'],
    l1_gs=weights['l1_gs'],
    l2_fp4=weights['l2_fp4'],
    l2_sf=weights['l2_sf'],
    l2_gs=weights['l2_gs'],
    num_experts=NUM_EXPERTS,
    hidden_size=HIDDEN_SIZE,
    intermediate_size=INTERMEDIATE_SIZE,
)

print(f"Reference output: amax={ref_out.amax().item():.4f} mean={ref_out.mean().item():.4f}")

# Run runner with warmup gs
runner = Nvfp4MoE(
    num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE, intermediate_size=INTERMEDIATE_SIZE,
    max_num_tokens=NUM_TOKENS, top_k=TOP_K, device='cuda'
)
# Set weights directly
runner.l1_fp4 = weights['l1_fp4']
runner.l1_sf = weights['l1_sf']
runner.l1_gs = weights['l1_gs']
runner.l2_fp4 = weights['l2_fp4']
runner.l2_sf = weights['l2_sf']
runner.l2_gs = weights['l2_gs']

# Compute warmup gs (the runner's activation global scales) from the inputs
runner.compute_activation_global_scales(hs, tw, ti)
print(f"Warmup gs: L1={runner._l1_activation_global_scale} L2={runner._l2_activation_global_scale}")

# Run with same input as reference
runner_out = runner.run(hs, tw, ti)
print(f"Runner output: amax={runner_out.amax().item():.4f} mean={runner_out.mean().item():.4f}")

# Compare in float32 so that low-precision or mismatched output dtypes do not
# distort (or break) the similarity computation.
ref_f32 = ref_out.float()
run_f32 = runner_out.float()

# Cosine similarity
cos = torch.nn.functional.cosine_similarity(
    ref_f32.flatten().unsqueeze(0), run_f32.flatten().unsqueeze(0)
).item()
print(f"Cosine similarity: {cos:.6f}")

# Check for NaN/Inf
print(f"Runner NaN: {torch.isnan(runner_out).any().item()} Inf: {torch.isinf(runner_out).any().item()}")
print(f"Ref NaN: {torch.isnan(ref_out).any().item()} Inf: {torch.isinf(ref_out).any().item()}")

# Per-token comparison
for i in range(NUM_TOKENS):
    cos_i = torch.nn.functional.cosine_similarity(
        ref_f32[i].unsqueeze(0), run_f32[i].unsqueeze(0)
    ).item()
    print(f" Token {i}: cosine={cos_i:.6f} ref_max={ref_out[i].amax().item():.4f} run_max={runner_out[i].amax().item():.4f}")