#!/usr/bin/env python3
# Provenance: new file test_se_gpu.py (mode 100644, blob edf25a2d) from patch
# db30c4acd61de0dc0212c40b38db076e5d32827f by biondizzle,
# Mon, 1 Jun 2026 03:50:53 +0000 -- "auto: pre-test push for test_se_gpu.py".
"""Smoke-run Nvfp4SharedExpert on CUDA devices 0 and 1.

For each device the script builds random BF16 gate/up/down weights, quantizes
them with ``quantize_weight_to_nvfp4``, installs the results on a fresh
``Nvfp4SharedExpert``, runs a single random input row through it, and prints
the largest absolute output value together with a NaN flag.
"""
import torch
from dsv4.layers.shared_expert import Nvfp4SharedExpert
from dsv4.ops.quantize import quantize_weight_to_nvfp4

# Layer dimensions used for the expert and for every tensor built below.
HIDDEN_SIZE = 7168
INTERMEDIATE_SIZE = 3072

# Seed once, before the device loop, so repeated runs draw the same values.
torch.manual_seed(42)

for gpu in (0, 1):
    torch.cuda.set_device(gpu)
    dev = f"cuda:{gpu}"

    expert = Nvfp4SharedExpert(
        hidden_size=HIDDEN_SIZE,
        intermediate_size=INTERMEDIATE_SIZE,
        device=dev,
    )

    # Random BF16 weights, drawn in gate -> up -> down order before any
    # quantization takes place.
    weight_shapes = (
        (INTERMEDIATE_SIZE, HIDDEN_SIZE),  # gate
        (INTERMEDIATE_SIZE, HIDDEN_SIZE),  # up
        (HIDDEN_SIZE, INTERMEDIATE_SIZE),  # down
    )
    bf16_weights = [
        torch.randn(*shape, dtype=torch.bfloat16, device=dev)
        for shape in weight_shapes
    ]

    # Quantize each weight to NVFP4, again in gate -> up -> down order.
    # NOTE(review): the third value returned by quantize_weight_to_nvfp4 is
    # discarded and l1_gs / l2_gs are set to 1.0 below -- confirm this is
    # intended.
    quantized = [quantize_weight_to_nvfp4(weight) for weight in bf16_weights]
    (gate_q, gate_scales, _), (up_q, up_scales, _), (down_q, down_scales, _) = quantized

    # Gate and up are concatenated along dim 0 into one tensor for the l1_*
    # attributes; down alone fills the l2_* attributes. Each attribute is a
    # one-element list.
    expert.l1_fp4 = [torch.cat([gate_q, up_q], dim=0)]
    expert.l1_sf = [torch.cat([gate_scales, up_scales], dim=0)]
    expert.l1_gs = [1.0]
    expert.l2_fp4 = [down_q]
    expert.l2_sf = [down_scales]
    expert.l2_gs = [1.0]

    # One random BF16 input row of width HIDDEN_SIZE.
    sample = torch.randn(1, HIDDEN_SIZE, dtype=torch.bfloat16, device=dev)

    # Run the expert and report the peak magnitude plus the NaN flag.
    result = expert.run(sample)
    has_nan = torch.isnan(result).any().item()
    peak = result.abs().max().item()
    print(f"GPU {gpu}: |out|={peak:.4f} has_nan={has_nan}")