cleanup: remove diagnostic test file

This commit is contained in:
2026-05-25 16:25:05 +00:00
parent a064b99d3d
commit 22a2fc563e

View File

@@ -1,77 +0,0 @@
"""Quick diagnostic: compare GPU vs Python FP4 nibbles element-by-element."""
import torch
import sys, os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dsv4.ops.quantize import quantize_activation_nvfp4, quantize_nvfp4_gpu, SF_VEC_SIZE
def test():
    """Print a comparison of the two NVFP4 quantizers on one small random input.

    Quantizes the same seeded bfloat16 tensor with ``quantize_activation_nvfp4``
    (treated here as the reference) and ``quantize_nvfp4_gpu``, then reports:
    the number of differing scale-factor bytes, every differing packed FP4
    byte split into its two nibbles, the overall byte match rate, and the
    max/mean absolute difference after dequantizing both results.

    Nothing is asserted or returned; the output is meant to be read by a human.
    """
    print("=== GPU Quantize Diagnostic ===")
    torch.manual_seed(42)
    # Kept deliberately tiny so every mismatching byte can be listed.
    M, N = 4, 64
    x = torch.randn(M, N, dtype=torch.bfloat16, device='cuda')
    gs = 1.0

    ref_fp4, ref_sf = quantize_activation_nvfp4(x, gs)
    gpu_fp4, gpu_sf = quantize_nvfp4_gpu(x, gs)

    # Scale factors are compared as raw bytes, not as decoded values.
    ref_sf_int = ref_sf.view(torch.uint8)
    gpu_sf_int = gpu_sf.view(torch.uint8)
    sf_diff = (ref_sf_int != gpu_sf_int).sum().item()
    print(f"SF diff bytes: {sf_diff} / {ref_sf_int.numel()}")

    # Packed payload: one byte carries two FP4 codes (low nibble = even
    # element, high nibble = odd element).
    ref_bytes = ref_fp4.view(torch.uint8)
    gpu_bytes = gpu_fp4.view(torch.uint8)
    positions = ((r, c) for r in range(M) for c in range(N // 2))
    for row, col in positions:
        rb = ref_bytes[row, col].item()
        gb = gpu_bytes[row, col].item()
        if rb == gb:
            continue
        # For a byte value, divmod by 16 yields (high nibble, low nibble).
        ref_odd, ref_even = divmod(rb, 16)
        gpu_odd, gpu_even = divmod(gb, 16)
        print(f" [{row},{col}] ref=0x{rb:02x} (e={ref_even},o={ref_odd}) gpu=0x{gb:02x} (e={gpu_even},o={gpu_odd})")
        if row != 0 or col >= 4:
            continue
        # For the first few bytes of row 0, also show the two source values
        # that were packed into this byte.
        v0, v1 = x[row, 2 * col:2 * col + 2].tolist()
        print(f" input: [{v0:.4f}, {v1:.4f}]")

    # Overall byte-level agreement.
    total = ref_bytes.numel()
    match = (ref_bytes == gpu_bytes).sum().item()
    print(f"FP4 byte match: {match}/{total} ({100*match/total:.1f}%)")

    # Compare in value space after dequantizing both results.
    ref_deq = _dequantize_nvfp4(ref_fp4, ref_sf, gs, N)
    gpu_deq = _dequantize_nvfp4(gpu_fp4, gpu_sf, gs, N)
    abs_err = (ref_deq - gpu_deq).abs()
    max_diff = abs_err.max().item()
    mean_diff = abs_err.mean().item()
    print(f"Max dequant diff: {max_diff:.6f}")
    print(f"Mean dequant diff: {mean_diff:.6f}")
def _dequantize_nvfp4(x_fp4, block_scale, global_scale, N):
M = x_fp4.shape[0]
raw = x_fp4.view(torch.uint8)
even = raw & 0x0F
odd = (raw >> 4) & 0x0F
indices = torch.stack([even, odd], dim=-1).reshape(M, N)
signs = (indices >= 8).float() * -2 + 1
mag = indices % 8
idx_to_hs = torch.tensor([0,2,4,6,8,10,12,14], dtype=torch.float32, device='cuda')
half_steps = idx_to_hs[mag.long()]
x_deq = signs * half_steps / 2.0
bs_exp = block_scale.repeat_interleave(16, dim=-1).float()
x_deq = x_deq * bs_exp * global_scale
return x_deq.to(torch.bfloat16)
# Run the diagnostic only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test()