"""Debug: Compare runner output vs reference pipeline output.
Focus on whether the scale assembly + GEMM produces correct values."""
import torch
import sys
sys.path.insert(0, '/root/nvfp4-megamoe-kernel/cutedsl')
sys.path.insert(0, '/root/nvfp4-megamoe-kernel/vllm')

from cutedsl.reference.moe_pipeline import moe_pipeline
from vllm.nvfp4_cutedsl import CuTeDSLMoERunner

# Make GPU 0 the current CUDA device for everything that follows.
torch.cuda.set_device(0)

# Fetch the real model weights of layer 0 (three experts requested);
# the result is indexed by keys such as 'l1_fp4' / 'l2_gs' further down.
from cutedsl.weight_loader import load_layer_weights
weights = load_layer_weights(num_experts=3, layer_idx=0)

# Problem sizes shared by the reference pipeline and the runner.
# NUM_EXPERTS must match the num_experts used when `weights` was loaded.
NUM_EXPERTS = 3
HIDDEN_SIZE = 256
INTERMEDIATE_SIZE = 512
NUM_TOKENS = 4
TOP_K = 2

# Entries of `weights` consumed by both code paths.
WEIGHT_KEYS = ('l1_fp4', 'l1_sf', 'l1_gs', 'l2_fp4', 'l2_sf', 'l2_gs')

# Build the inputs ONCE and feed the very same tensors to both paths.
# Previously the reference drew its own torch.randn() activations while the
# runner used a second, independent draw, so the two outputs were computed
# from different inputs and the comparison below could not indicate whether
# the runner matches the reference.
hs = torch.randn(NUM_TOKENS, HIDDEN_SIZE, dtype=torch.bfloat16, device='cuda')
# Uniform routing weights: every token splits evenly across its TOP_K experts.
tw = torch.ones(NUM_TOKENS, TOP_K, dtype=torch.float32, device='cuda') / TOP_K
# Every token is routed to experts 0 .. TOP_K-1.
ti = torch.tensor([list(range(TOP_K))] * NUM_TOKENS, dtype=torch.int64, device='cuda')

# Run reference pipeline with dynamic gs
ref_out = moe_pipeline(
    hidden_states=hs,
    topk_weights=tw,
    topk_ids=ti,
    num_experts=NUM_EXPERTS,
    hidden_size=HIDDEN_SIZE,
    intermediate_size=INTERMEDIATE_SIZE,
    **{key: weights[key] for key in WEIGHT_KEYS},
)

print(f"Reference output: amax={ref_out.amax().item():.4f} mean={ref_out.mean().item():.4f}")

# Run runner with warmup gs
runner = CuTeDSLMoERunner(
    num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE, intermediate_size=INTERMEDIATE_SIZE,
    max_num_tokens=NUM_TOKENS, top_k=TOP_K, device='cuda'
)
# Set weights directly on the runner attributes of the same name.
for key in WEIGHT_KEYS:
    setattr(runner, key, weights[key])

# Compute warmup gs (activation global scales) from the shared inputs.
runner.compute_activation_global_scales(hs, tw, ti)
print(f"Warmup gs: L1={runner._l1_activation_global_scale} L2={runner._l2_activation_global_scale}")

# Run with same input as reference
runner_out = runner.run(hs, tw, ti)
print(f"Runner output: amax={runner_out.amax().item():.4f} mean={runner_out.mean().item():.4f}")

# Upcast both outputs to float32 before measuring similarity.
# NOTE(review): the inputs in this script are bfloat16, so the outputs are
# presumably low precision as well - confirm. A similarity reduced in bfloat16
# only resolves about three significant digits, which would make the
# six-decimal figures printed below misleading. The cast is a no-op for
# tensors that are already float32.
ref_f32 = ref_out.float()
run_f32 = runner_out.float()

# Cosine similarity over the whole (flattened) output
cos = torch.nn.functional.cosine_similarity(
    ref_f32.flatten().unsqueeze(0), run_f32.flatten().unsqueeze(0)
).item()
print(f"Cosine similarity: {cos:.6f}")

# Check for NaN/Inf
print(f"Runner NaN: {torch.isnan(runner_out).any().item()} Inf: {torch.isinf(runner_out).any().item()}")
print(f"Ref NaN: {torch.isnan(ref_out).any().item()} Inf: {torch.isinf(ref_out).any().item()}")

# Per-token comparison; the row count is taken from the reference output
# instead of being hard-coded, so it follows whatever batch was run.
for i in range(ref_out.shape[0]):
    cos_i = torch.nn.functional.cosine_similarity(ref_f32[i].unsqueeze(0), run_f32[i].unsqueeze(0)).item()
    print(f"  Token {i}: cosine={cos_i:.6f} ref_max={ref_out[i].amax().item():.4f} run_max={runner_out[i].amax().item():.4f}")