"""Debug: Compare runner output vs reference pipeline output. Focus on whether the scale assembly + GEMM produces correct values.""" import torch import sys sys.path.insert(0, '/root/nvfp4-megamoe-kernel/cutedsl') sys.path.insert(0, '/root/nvfp4-megamoe-kernel/vllm') from cutedsl.reference.moe_pipeline import moe_pipeline from vllm.nvfp4_cutedsl import CuTeDSLMoERunner torch.cuda.set_device(0) # Load real model weights for layer 0 from cutedsl.weight_loader import load_layer_weights weights = load_layer_weights(layer_idx=0, num_experts=3) # Run reference pipeline with dynamic gs ref_out = moe_pipeline( hidden_states=torch.randn(4, 256, dtype=torch.bfloat16, device='cuda'), topk_weights=torch.ones(4, 2, dtype=torch.float32, device='cuda') / 2, topk_ids=torch.tensor([[0,1],[0,1],[0,1],[0,1]], dtype=torch.int64, device='cuda'), l1_fp4=weights['l1_fp4'], l1_sf=weights['l1_sf'], l1_gs=weights['l1_gs'], l2_fp4=weights['l2_fp4'], l2_sf=weights['l2_sf'], l2_gs=weights['l2_gs'], num_experts=3, hidden_size=256, intermediate_size=512, ) print(f"Reference output: amax={ref_out.amax().item():.4f} mean={ref_out.mean().item():.4f}") # Run runner with warmup gs runner = CuTeDSLMoERunner( num_experts=3, hidden_size=256, intermediate_size=512, max_num_tokens=4, top_k=2, device='cuda' ) # Set weights directly runner.l1_fp4 = weights['l1_fp4'] runner.l1_sf = weights['l1_sf'] runner.l1_gs = weights['l1_gs'] runner.l2_fp4 = weights['l2_fp4'] runner.l2_sf = weights['l2_sf'] runner.l2_gs = weights['l2_gs'] # Compute warmup gs hs = torch.randn(4, 256, dtype=torch.bfloat16, device='cuda') tw = torch.ones(4, 2, dtype=torch.float32, device='cuda') / 2 ti = torch.tensor([[0,1],[0,1],[0,1],[0,1]], dtype=torch.int64, device='cuda') runner.compute_activation_global_scales(hs, tw, ti) print(f"Warmup gs: L1={runner._l1_activation_global_scale} L2={runner._l2_activation_global_scale}") # Run with same input as reference runner_out = runner.run(hs, tw, ti) print(f"Runner output: amax={runner_out.amax().item():.4f} mean={runner_out.mean().item():.4f}") # Cosine similarity cos = torch.nn.functional.cosine_similarity(ref_out.flatten().unsqueeze(0), runner_out.flatten().unsqueeze(0)).item() print(f"Cosine similarity: {cos:.6f}") # Check for NaN/Inf print(f"Runner NaN: {torch.isnan(runner_out).any().item()} Inf: {torch.isinf(runner_out).any().item()}") print(f"Ref NaN: {torch.isnan(ref_out).any().item()} Inf: {torch.isinf(ref_out).any().item()}") # Per-token comparison for i in range(4): cos_i = torch.nn.functional.cosine_similarity(ref_out[i].unsqueeze(0), runner_out[i].unsqueeze(0)).item() print(f" Token {i}: cosine={cos_i:.6f} ref_max={ref_out[i].amax().item():.4f} run_max={runner_out[i].amax().item():.4f}")