cleanup: remove diagnostic test file
This commit is contained in:
@@ -1,77 +0,0 @@
|
||||
"""Quick diagnostic: compare GPU vs Python FP4 nibbles element-by-element."""
|
||||
import torch
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||
from dsv4.ops.quantize import quantize_activation_nvfp4, quantize_nvfp4_gpu, SF_VEC_SIZE
|
||||
|
||||
|
||||
def test():
    """Print a side-by-side diagnostic of the two NVFP4 quantizers.

    One small random bfloat16 CUDA tensor is quantized with both
    ``quantize_activation_nvfp4`` and ``quantize_nvfp4_gpu``. The function
    then prints, in order: the number of differing scale-factor bytes, every
    differing packed FP4 byte split into its low ("e") and high ("o")
    nibbles, the overall byte match rate, and the max / mean absolute
    difference after dequantizing both results with ``_dequantize_nvfp4``.

    Nothing is asserted or returned; the output on stdout is the result.
    """
    print("=== GPU Quantize Diagnostic ===")
    torch.manual_seed(42)

    # Small case so an element-by-element dump stays readable.
    n_rows, n_cols = 4, 64
    inputs = torch.randn(n_rows, n_cols, dtype=torch.bfloat16, device='cuda')
    global_scale = 1.0

    ref_fp4, ref_sf = quantize_activation_nvfp4(inputs, global_scale)
    gpu_fp4, gpu_sf = quantize_nvfp4_gpu(inputs, global_scale)

    # Scale factors are compared on their raw byte representation.
    ref_sf_raw = ref_sf.view(torch.uint8)
    gpu_sf_raw = gpu_sf.view(torch.uint8)
    sf_mismatches = ref_sf_raw.ne(gpu_sf_raw).sum().item()
    print(f"SF diff bytes: {sf_mismatches} / {ref_sf_raw.numel()}")

    # Packed FP4 payload: two 4-bit codes per byte.
    ref_bytes = ref_fp4.view(torch.uint8)
    gpu_bytes = gpu_fp4.view(torch.uint8)

    for r in range(n_rows):
        for c in range(n_cols // 2):
            ref_byte = ref_bytes[r, c].item()
            gpu_byte = gpu_bytes[r, c].item()
            if ref_byte == gpu_byte:
                continue

            # divmod by 16 yields (high nibble, low nibble) for a 0..255 byte.
            ref_hi, ref_lo = divmod(ref_byte, 16)
            gpu_hi, gpu_lo = divmod(gpu_byte, 16)
            print(f" [{r},{c}] ref=0x{ref_byte:02x} (e={ref_lo},o={ref_hi}) gpu=0x{gpu_byte:02x} (e={gpu_lo},o={gpu_hi})")

            # NOTE(review): nesting of this branch under the mismatch case is
            # reconstructed from the logic; confirm against the original file.
            if r == 0 and c < 4:
                # Show the two source values that were packed into this byte.
                first = inputs[r, 2 * c].item()
                second = inputs[r, 2 * c + 1].item()
                print(f" input: [{first:.4f}, {second:.4f}]")

    # Overall byte-level agreement.
    total = ref_bytes.numel()
    match = ref_bytes.eq(gpu_bytes).sum().item()
    print(f"FP4 byte match: {match}/{total} ({100*match/total:.1f}%)")

    # Compare the two results after mapping them back to real values.
    ref_deq = _dequantize_nvfp4(ref_fp4, ref_sf, global_scale, n_cols)
    gpu_deq = _dequantize_nvfp4(gpu_fp4, gpu_sf, global_scale, n_cols)
    abs_err = (ref_deq - gpu_deq).abs()
    max_diff = abs_err.max().item()
    mean_diff = abs_err.mean().item()
    print(f"Max dequant diff: {max_diff:.6f}")
    print(f"Mean dequant diff: {mean_diff:.6f}")
|
||||
|
||||
|
||||
def _dequantize_nvfp4(x_fp4, block_scale, global_scale, N):
|
||||
M = x_fp4.shape[0]
|
||||
raw = x_fp4.view(torch.uint8)
|
||||
even = raw & 0x0F
|
||||
odd = (raw >> 4) & 0x0F
|
||||
indices = torch.stack([even, odd], dim=-1).reshape(M, N)
|
||||
signs = (indices >= 8).float() * -2 + 1
|
||||
mag = indices % 8
|
||||
idx_to_hs = torch.tensor([0,2,4,6,8,10,12,14], dtype=torch.float32, device='cuda')
|
||||
half_steps = idx_to_hs[mag.long()]
|
||||
x_deq = signs * half_steps / 2.0
|
||||
bs_exp = block_scale.repeat_interleave(16, dim=-1).float()
|
||||
x_deq = x_deq * bs_exp * global_scale
|
||||
return x_deq.to(torch.bfloat16)
|
||||
|
||||
|
||||
# Run the diagnostic only when this file is executed directly as a script.
if __name__ == "__main__":
    test()
|
||||
Reference in New Issue
Block a user