#!/usr/bin/env python3
"""Test: run_nvfp4_grouped_gemm with 1 expert on different GPUs."""
import torch
from dsv4.ops.gemm_runner import run_nvfp4_grouped_gemm
from dsv4.ops.quantize import quantize_nvfp4_gpu, quantize_weight_to_nvfp4
from dsv4.ops.layouts import make_b_k_major, assemble_scales_3d_side

# Fixed seed so repeated runs draw the same random weights and activations.
torch.manual_seed(42)

# Problem size: M rows of interest, weight matrix of N x K.
M, N, K = 1, 3072, 7168

# Number of activation rows actually generated and handed to the kernel.
# Only the first M rows of the output are inspected afterwards.
PADDED_M = 128

# CUDA device indices this smoke test tries to exercise.
GPUS = (0, 1)

# Queried once up front so a host with fewer devices skips the missing ones
# instead of raising from torch.cuda.set_device().
num_visible = torch.cuda.device_count()

for gpu in GPUS:
    if gpu >= num_visible:
        print(f"GPU {gpu}: skipped (only {num_visible} CUDA device(s) visible)")
        continue

    torch.cuda.set_device(gpu)
    dev = f"cuda:{gpu}"

    # Random bf16 weight, quantized by the project helper.
    w = torch.randn(N, K, dtype=torch.bfloat16, device=dev)
    # NOTE(review): the third return value (named w_gs in the original, looks
    # like a weight global scale) is never used; global_scale_b below is fixed
    # at 1.0 instead. Confirm this is intentional for the test.
    w_fp4, w_sf, _w_gs = quantize_weight_to_nvfp4(w)

    # Add a leading expert dimension of size 1 and convert to the layouts the
    # runner is given. Exact output shapes depend on the layout helpers and
    # are not visible here -- TODO confirm.
    w_km = make_b_k_major(w_fp4.unsqueeze(0))
    w_sf_3d = assemble_scales_3d_side(w_sf.unsqueeze(0))

    # Activation: PADDED_M random rows (all rows are random, not zero-filled).
    x = torch.randn(PADDED_M, K, dtype=torch.bfloat16, device=dev)
    # presumably 6 = FP4 (E2M1) max and 448 = FP8 (E4M3) max -- verify
    gsa = 1.0 / (6.0 * 448.0)
    x_fp4, x_sf = quantize_nvfp4_gpu(x, gsa)

    # Single expert: one offset entry equal to the number of activation rows.
    expert_offsets = torch.tensor([PADDED_M], dtype=torch.int32, device=dev)

    # Global scales as one-element fp32 tensors on the target device.
    gsa_buf = torch.tensor([gsa], dtype=torch.float32, device=dev)
    gsb = torch.tensor([1.0], dtype=torch.float32, device=dev)

    # Run the grouped GEMM under test.
    out = run_nvfp4_grouped_gemm(
        mat_a=x_fp4,
        scale_a=x_sf,
        mat_b=w_km,
        scale_b=w_sf_3d,
        expert_offsets=expert_offsets,
        global_scale_a=gsa_buf,
        global_scale_b=gsb,
    )

    # Report on the first M rows only; the abs-max is computed only when the
    # slice is NaN-free, otherwise the literal 'NaN' is printed in its place.
    has_nan = torch.isnan(out[:M]).any().item()
    peak = "NaN" if has_nan else out[:M].abs().max().item()
    print(f"GPU {gpu}: |out|={peak} has_nan={has_nan} shape={out.shape}")