From 22a2fc563eee9cc7ff63fc89e1057516c646fcbc Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Mon, 25 May 2026 16:25:05 +0000
Subject: [PATCH] cleanup: remove diagnostic test file

---
 tests/unit/test_nvfp4_gpu_diag.py | 77 -------------------------------
 1 file changed, 77 deletions(-)
 delete mode 100644 tests/unit/test_nvfp4_gpu_diag.py

diff --git a/tests/unit/test_nvfp4_gpu_diag.py b/tests/unit/test_nvfp4_gpu_diag.py
deleted file mode 100644
index 26b38ea6..00000000
--- a/tests/unit/test_nvfp4_gpu_diag.py
+++ /dev/null
@@ -1,77 +0,0 @@
-"""Quick diagnostic: compare GPU vs Python FP4 nibbles element-by-element."""
-import torch
-import sys, os
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-from dsv4.ops.quantize import quantize_activation_nvfp4, quantize_nvfp4_gpu, SF_VEC_SIZE
-
-
-def test():
-    print("=== GPU Quantize Diagnostic ===")
-    torch.manual_seed(42)
-    M, N = 4, 64  # Small case for element-by-element comparison
-    x = torch.randn(M, N, dtype=torch.bfloat16, device='cuda')
-    gs = 1.0
-    
-    ref_fp4, ref_sf = quantize_activation_nvfp4(x, gs)
-    gpu_fp4, gpu_sf = quantize_nvfp4_gpu(x, gs)
-    
-    # Compare SF
-    ref_sf_int = ref_sf.view(torch.uint8)
-    gpu_sf_int = gpu_sf.view(torch.uint8)
-    sf_diff = (ref_sf_int != gpu_sf_int).sum().item()
-    print(f"SF diff bytes: {sf_diff} / {ref_sf_int.numel()}")
-    
-    # Compare FP4 nibble-by-nibble
-    ref_bytes = ref_fp4.view(torch.uint8)
-    gpu_bytes = gpu_fp4.view(torch.uint8)
-    
-    for row in range(M):
-        for col in range(N // 2):
-            rb = ref_bytes[row, col].item()
-            gb = gpu_bytes[row, col].item()
-            if rb != gb:
-                ref_even = rb & 0x0F
-                ref_odd = (rb >> 4) & 0x0F
-                gpu_even = gb & 0x0F
-                gpu_odd = (gb >> 4) & 0x0F
-                print(f"  [{row},{col}] ref=0x{rb:02x} (e={ref_even},o={ref_odd}) gpu=0x{gb:02x} (e={gpu_even},o={gpu_odd})")
-                if row == 0 and col < 4:
-                    # Print the original values
-                    v0 = x[row, col*2].item()
-                    v1 = x[row, col*2+1].item()
-                    print(f"    input: [{v0:.4f}, {v1:.4f}]")
-    
-    # Total match
-    total = ref_bytes.numel()
-    match = (ref_bytes == gpu_bytes).sum().item()
-    print(f"FP4 byte match: {match}/{total} ({100*match/total:.1f}%)")
-    
-    # Dequantize both
-    ref_deq = _dequantize_nvfp4(ref_fp4, ref_sf, gs, N)
-    gpu_deq = _dequantize_nvfp4(gpu_fp4, gpu_sf, gs, N)
-    
-    # Per-element comparison
-    max_diff = (ref_deq - gpu_deq).abs().max().item()
-    mean_diff = (ref_deq - gpu_deq).abs().mean().item()
-    print(f"Max dequant diff: {max_diff:.6f}")
-    print(f"Mean dequant diff: {mean_diff:.6f}")
-
-
-def _dequantize_nvfp4(x_fp4, block_scale, global_scale, N):
-    M = x_fp4.shape[0]
-    raw = x_fp4.view(torch.uint8)
-    even = raw & 0x0F
-    odd = (raw >> 4) & 0x0F
-    indices = torch.stack([even, odd], dim=-1).reshape(M, N)
-    signs = (indices >= 8).float() * -2 + 1
-    mag = indices % 8
-    idx_to_hs = torch.tensor([0,2,4,6,8,10,12,14], dtype=torch.float32, device='cuda')
-    half_steps = idx_to_hs[mag.long()]
-    x_deq = signs * half_steps / 2.0
-    bs_exp = block_scale.repeat_interleave(16, dim=-1).float()
-    x_deq = x_deq * bs_exp * global_scale
-    return x_deq.to(torch.bfloat16)
-
-
-if __name__ == '__main__':
-    test()