Files
nvfp4-megamoe-kernel/tests/integration/test_se_gpu.py
biondizzle 8de47e26ce Cleanup Step 1: Move root-level files to proper directories
- Move test_*.py → tests/integration/
- Move probe_*.py, dump_*.py → helpers/
- Move PERFORMANCE_AUDIT.md → docs/
- Move single_shot_PYTORCH_REFERENCE.py → dsv4/reference/
- Fix 3 import references in test_layer_comparison, test_mhc_comparison, test_compressor_position_bias
- Add helpers/import_closure.py (dead-code detection tool)
2026-06-02 19:24:39 +00:00

38 lines
1.3 KiB
Python

#!/usr/bin/env python3
"""Test shared expert on different GPUs."""
import torch
from dsv4.layers.shared_expert import Nvfp4SharedExpert
from dsv4.ops.quantize import quantize_weight_to_nvfp4
torch.manual_seed(42)
for gpu in [0, 1]:
torch.cuda.set_device(gpu)
dev = f"cuda:{gpu}"
se = Nvfp4SharedExpert(hidden_size=7168, intermediate_size=3072, device=dev)
# Create random BF16 weights and quantize to NVFP4
gate_w = torch.randn(3072, 7168, dtype=torch.bfloat16, device=dev)
up_w = torch.randn(3072, 7168, dtype=torch.bfloat16, device=dev)
down_w = torch.randn(7168, 3072, dtype=torch.bfloat16, device=dev)
gate_fp4, gate_sf, gate_gs = quantize_weight_to_nvfp4(gate_w)
up_fp4, up_sf, up_gs = quantize_weight_to_nvfp4(up_w)
down_fp4, down_sf, down_gs = quantize_weight_to_nvfp4(down_w)
se.l1_fp4 = [torch.cat([gate_fp4, up_fp4], dim=0)]
se.l1_sf = [torch.cat([gate_sf, up_sf], dim=0)]
se.l1_gs = [1.0]
se.l2_fp4 = [down_fp4]
se.l2_sf = [down_sf]
se.l2_gs = [1.0]
# Input
x = torch.randn(1, 7168, dtype=torch.bfloat16, device=dev)
# Run
out = se.run(x)
has_nan = torch.isnan(out).any().item()
print(f"GPU {gpu}: |out|={out.abs().max().item():.4f} has_nan={has_nan}")