From 0fefadedd40c25ee59f9cc287aeaf82883440c0e Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 2 Jun 2026 09:46:32 +0000 Subject: [PATCH] KV-1: Fix FP8 round-trip mismatch in fused quantize CRITICAL: quantize must use the FP8-round-tripped block scale, not the raw pre-FP8 value. The dequant reads the FP8 bytes back, so the quantize must match exactly. Same pattern as quantize_nvfp4.cu. This was the root cause of cos=0.925 (should be ~0.995). --- dsv4/kernels/cuda/compressor_reduce_quant.cu | 25 +++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/dsv4/kernels/cuda/compressor_reduce_quant.cu b/dsv4/kernels/cuda/compressor_reduce_quant.cu index 3792cc9b..797471d7 100644 --- a/dsv4/kernels/cuda/compressor_reduce_quant.cu +++ b/dsv4/kernels/cuda/compressor_reduce_quant.cu @@ -104,12 +104,20 @@ __global__ void csa_compress_reduce_quant_kernel( int base=b*16; float ba=0; for(int i=0;i<16;i++){int c=base+i;if(c12)hs=12;lo=half_step_to_e2m1(hs);if(s<0)lo+=8;} - if(!z&&c112)hs=12;hi=half_step_to_e2m1(hs);if(s<0)hi+=8;} + if(!z&&c012)hs=12;lo=half_step_to_e2m1(hs);if(s<0)lo+=8;} + if(!z&&c112)hs=12;hi=half_step_to_e2m1(hs);if(s<0)hi+=8;} out_fp4[bi*(hd/2)+b*8+i]=(hi<<4)|lo; } } @@ -156,12 +164,13 @@ __global__ void hca_compress_reduce_quant_kernel( int base=b*16; float ba=0;for(int i=0;i<16;i++){int c=base+i;if(c12)hs=12;lo=half_step_to_e2m1(hs);if(s<0)lo+=8;} - if(!z&&c112)hs=12;hi=half_step_to_e2m1(hs);if(s<0)hi+=8;} + if(!z&&c012)hs=12;lo=half_step_to_e2m1(hs);if(s<0)lo+=8;} + if(!z&&c112)hs=12;hi=half_step_to_e2m1(hs);if(s<0)hi+=8;} out_fp4[bi*(hd/2)+b*8+i]=(hi<<4)|lo; } }