auto: pre-test push for test_se_gpu.py

2026-06-01 03:50:53 +00:00
parent 3dd95ce77b
commit db30c4acd6
1 changed files with 37 additions and 0 deletions
--- a/test_se_gpu.py
+++ b/test_se_gpu.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""Smoke-test Nvfp4SharedExpert on CUDA devices 0 and 1 using random NVFP4-quantized weights."""
+import torch
+from dsv4.layers.shared_expert import Nvfp4SharedExpert
+from dsv4.ops.quantize import quantize_weight_to_nvfp4
+
+torch.manual_seed(42)  # fixed seed, set once before the loop, so the random data is repeatable
+
+for gpu in [0, 1]:  # NOTE(review): indices are hard-coded; needs at least two visible CUDA devices
+    torch.cuda.set_device(gpu)  # make this GPU the current CUDA device
+    dev = f"cuda:{gpu}"
+    # Build the layer under test directly on this device.
+    se = Nvfp4SharedExpert(hidden_size=7168, intermediate_size=3072, device=dev)
+    
+    # Random BF16 weights for the gate, up and down projections; quantized to NVFP4 below.
+    gate_w = torch.randn(3072, 7168, dtype=torch.bfloat16, device=dev)
+    up_w = torch.randn(3072, 7168, dtype=torch.bfloat16, device=dev)
+    down_w = torch.randn(7168, 3072, dtype=torch.bfloat16, device=dev)
+    # Each result unpacks to (fp4, sf, gs) -- presumably data, scale factors, global scale; TODO confirm.
+    gate_fp4, gate_sf, gate_gs = quantize_weight_to_nvfp4(gate_w)
+    up_fp4, up_sf, up_gs = quantize_weight_to_nvfp4(up_w)
+    down_fp4, down_sf, down_gs = quantize_weight_to_nvfp4(down_w)
+    # Install the quantized weights as one-element lists; gate and up are concatenated along dim 0.
+    se.l1_fp4 = [torch.cat([gate_fp4, up_fp4], dim=0)]
+    se.l1_sf = [torch.cat([gate_sf, up_sf], dim=0)]
+    se.l1_gs = [1.0]  # NOTE(review): gate_gs/up_gs from quantization are unused; confirm 1.0 is intended
+    se.l2_fp4 = [down_fp4]
+    se.l2_sf = [down_sf]
+    se.l2_gs = [1.0]  # NOTE(review): down_gs from quantization is unused; confirm 1.0 is intended
+    
+    # Single-row BF16 input of shape (1, 7168), matching hidden_size above.
+    x = torch.randn(1, 7168, dtype=torch.bfloat16, device=dev)
+    
+    # Run the layer, then print the largest absolute output value and whether any NaN appeared.
+    out = se.run(x)
+    has_nan = torch.isnan(out).any().item()
+    print(f"GPU {gpu}: |out|={out.abs().max().item():.4f} has_nan={has_nan}")