Files
nvfp4-megamoe-kernel/tests/archive/debug_output.py
biondizzle 3fb3c925af Restructure: cutedsl/ -> dsv4/ with proper layering
- Split bridge.py -> ops/quantize.py, ops/layouts.py, ops/gemm_runner.py
- Renamed classes: CuTeDSLNvfp4Linear -> Nvfp4Linear, etc.
- Moved kernel code to dsv4/kernels/ (gemm, attention, compressor, decode, cuda)
- Moved PyTorch bridges to dsv4/ops/
- Moved nn.Module layers to dsv4/layers/
- Moved reference implementations to dsv4/reference/
- Moved vendored CUTLASS code to vendored/
- Archived ~190 debug tests to tests/archive/
- Kept ~15 canonical tests in tests/unit/
- Updated all import paths
- Added stubs for future components (model/, cache/, loader/)
- Updated pyproject.toml: dsv4-inference package name
2026-05-21 17:30:44 +00:00

71 lines
2.7 KiB
Python

"""Debug: Compare runner output vs reference pipeline output.
Focus on whether the scale assembly + GEMM produces correct values."""
import sys

import torch

sys.path.insert(0, '/root/nvfp4-megamoe-kernel/cutedsl')
sys.path.insert(0, '/root/nvfp4-megamoe-kernel/vllm')
from cutedsl.reference.moe_pipeline import moe_pipeline
from vllm.nvfp4_cutedsl import Nvfp4MoE

torch.cuda.set_device(0)

# Problem dimensions shared by the reference pipeline and the runner.
NUM_EXPERTS = 3
HIDDEN_SIZE = 256
INTERMEDIATE_SIZE = 512
NUM_TOKENS = 4
TOP_K = 2

# Keys of the weight dict that are forwarded to both implementations.
WEIGHT_KEYS = ('l1_fp4', 'l1_sf', 'l1_gs', 'l2_fp4', 'l2_sf', 'l2_gs')

# Load real model weights for layer 0
from cutedsl.weight_loader import load_layer_weights
weights = load_layer_weights(layer_idx=0, num_experts=NUM_EXPERTS)

# Build the inputs ONCE so the reference and the runner see identical data.
# Drawing a separate torch.randn tensor per path (as this script used to do)
# makes the cosine similarity below compare outputs of unrelated inputs.
hs = torch.randn(NUM_TOKENS, HIDDEN_SIZE, dtype=torch.bfloat16, device='cuda')
# Uniform routing weights: each of the TOP_K selected experts gets 1/TOP_K.
tw = torch.ones(NUM_TOKENS, TOP_K, dtype=torch.float32, device='cuda') / TOP_K
# Every token is routed to experts 0 .. TOP_K-1.
ti = torch.tensor([list(range(TOP_K))] * NUM_TOKENS, dtype=torch.int64, device='cuda')

# Run reference pipeline with dynamic gs.
# Clones are passed defensively: whether moe_pipeline modifies its inputs in
# place is not visible from this script, and the runner must see pristine data.
ref_out = moe_pipeline(
    hidden_states=hs.clone(),
    topk_weights=tw.clone(),
    topk_ids=ti.clone(),
    num_experts=NUM_EXPERTS,
    hidden_size=HIDDEN_SIZE,
    intermediate_size=INTERMEDIATE_SIZE,
    **{key: weights[key] for key in WEIGHT_KEYS},
)
print(f"Reference output: amax={ref_out.amax().item():.4f} mean={ref_out.mean().item():.4f}")

# Run runner with warmup gs
runner = Nvfp4MoE(
    num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE, intermediate_size=INTERMEDIATE_SIZE,
    max_num_tokens=NUM_TOKENS, top_k=TOP_K, device='cuda'
)

# Set weights directly on the runner, bypassing whatever loading path it has.
for key in WEIGHT_KEYS:
    setattr(runner, key, weights[key])

# Compute warmup gs
runner.compute_activation_global_scales(hs, tw, ti)
print(f"Warmup gs: L1={runner._l1_activation_global_scale} L2={runner._l2_activation_global_scale}")

# Run with same input as reference
runner_out = runner.run(hs, tw, ti)
print(f"Runner output: amax={runner_out.amax().item():.4f} mean={runner_out.mean().item():.4f}")

cosine_similarity = torch.nn.functional.cosine_similarity

# Cosine similarity over the whole (flattened) output
cos = cosine_similarity(ref_out.flatten().unsqueeze(0), runner_out.flatten().unsqueeze(0)).item()
print(f"Cosine similarity: {cos:.6f}")

# Check for NaN/Inf
print(f"Runner NaN: {torch.isnan(runner_out).any().item()} Inf: {torch.isinf(runner_out).any().item()}")
print(f"Ref NaN: {torch.isnan(ref_out).any().item()} Inf: {torch.isinf(ref_out).any().item()}")

# Per-token comparison
for i in range(NUM_TOKENS):
    cos_i = cosine_similarity(ref_out[i].unsqueeze(0), runner_out[i].unsqueeze(0)).item()
    print(f" Token {i}: cosine={cos_i:.6f} ref_max={ref_out[i].amax().item():.4f} run_max={runner_out[i].amax().item():.4f}")