From 22a2fc563eee9cc7ff63fc89e1057516c646fcbc Mon Sep 17 00:00:00 2001 From: biondizzle Date: Mon, 25 May 2026 16:25:05 +0000 Subject: [PATCH] cleanup: remove diagnostic test file --- tests/unit/test_nvfp4_gpu_diag.py | 77 ------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 tests/unit/test_nvfp4_gpu_diag.py diff --git a/tests/unit/test_nvfp4_gpu_diag.py b/tests/unit/test_nvfp4_gpu_diag.py deleted file mode 100644 index 26b38ea6..00000000 --- a/tests/unit/test_nvfp4_gpu_diag.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Quick diagnostic: compare GPU vs Python FP4 nibbles element-by-element.""" -import torch -import sys, os -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) -from dsv4.ops.quantize import quantize_activation_nvfp4, quantize_nvfp4_gpu, SF_VEC_SIZE - - -def test(): - print("=== GPU Quantize Diagnostic ===") - torch.manual_seed(42) - M, N = 4, 64 # Small case for element-by-element comparison - x = torch.randn(M, N, dtype=torch.bfloat16, device='cuda') - gs = 1.0 - - ref_fp4, ref_sf = quantize_activation_nvfp4(x, gs) - gpu_fp4, gpu_sf = quantize_nvfp4_gpu(x, gs) - - # Compare SF - ref_sf_int = ref_sf.view(torch.uint8) - gpu_sf_int = gpu_sf.view(torch.uint8) - sf_diff = (ref_sf_int != gpu_sf_int).sum().item() - print(f"SF diff bytes: {sf_diff} / {ref_sf_int.numel()}") - - # Compare FP4 nibble-by-nibble - ref_bytes = ref_fp4.view(torch.uint8) - gpu_bytes = gpu_fp4.view(torch.uint8) - - for row in range(M): - for col in range(N // 2): - rb = ref_bytes[row, col].item() - gb = gpu_bytes[row, col].item() - if rb != gb: - ref_even = rb & 0x0F - ref_odd = (rb >> 4) & 0x0F - gpu_even = gb & 0x0F - gpu_odd = (gb >> 4) & 0x0F - print(f" [{row},{col}] ref=0x{rb:02x} (e={ref_even},o={ref_odd}) gpu=0x{gb:02x} (e={gpu_even},o={gpu_odd})") - if row == 0 and col < 4: - # Print the original values - v0 = x[row, col*2].item() - v1 = x[row, col*2+1].item() - print(f" input: [{v0:.4f}, {v1:.4f}]") - - # Total match - total = ref_bytes.numel() - match = (ref_bytes == gpu_bytes).sum().item() - print(f"FP4 byte match: {match}/{total} ({100*match/total:.1f}%)") - - # Dequantize both - ref_deq = _dequantize_nvfp4(ref_fp4, ref_sf, gs, N) - gpu_deq = _dequantize_nvfp4(gpu_fp4, gpu_sf, gs, N) - - # Per-element comparison - max_diff = (ref_deq - gpu_deq).abs().max().item() - mean_diff = (ref_deq - gpu_deq).abs().mean().item() - print(f"Max dequant diff: {max_diff:.6f}") - print(f"Mean dequant diff: {mean_diff:.6f}") - - -def _dequantize_nvfp4(x_fp4, block_scale, global_scale, N): - M = x_fp4.shape[0] - raw = x_fp4.view(torch.uint8) - even = raw & 0x0F - odd = (raw >> 4) & 0x0F - indices = torch.stack([even, odd], dim=-1).reshape(M, N) - signs = (indices >= 8).float() * -2 + 1 - mag = indices % 8 - idx_to_hs = torch.tensor([0,2,4,6,8,10,12,14], dtype=torch.float32, device='cuda') - half_steps = idx_to_hs[mag.long()] - x_deq = signs * half_steps / 2.0 - bs_exp = block_scale.repeat_interleave(16, dim=-1).float() - x_deq = x_deq * bs_exp * global_scale - return x_deq.to(torch.bfloat16) - - -if __name__ == '__main__': - test()