diff --git a/dsv4/kernels/cuda/compressor_reduce_quant.cu b/dsv4/kernels/cuda/compressor_reduce_quant.cu index 3792cc9b..797471d7 100644 --- a/dsv4/kernels/cuda/compressor_reduce_quant.cu +++ b/dsv4/kernels/cuda/compressor_reduce_quant.cu @@ -104,12 +104,20 @@ __global__ void csa_compress_reduce_quant_kernel( int base=b*16; float ba=0; for(int i=0;i<16;i++){int c=base+i;if(c12)hs=12;lo=half_step_to_e2m1(hs);if(s<0)lo+=8;} - if(!z&&c112)hs=12;hi=half_step_to_e2m1(hs);if(s<0)hi+=8;} + if(!z&&c012)hs=12;lo=half_step_to_e2m1(hs);if(s<0)lo+=8;} + if(!z&&c112)hs=12;hi=half_step_to_e2m1(hs);if(s<0)hi+=8;} out_fp4[bi*(hd/2)+b*8+i]=(hi<<4)|lo; } } @@ -156,12 +164,13 @@ __global__ void hca_compress_reduce_quant_kernel( int base=b*16; float ba=0;for(int i=0;i<16;i++){int c=base+i;if(c12)hs=12;lo=half_step_to_e2m1(hs);if(s<0)lo+=8;} - if(!z&&c112)hs=12;hi=half_step_to_e2m1(hs);if(s<0)hi+=8;} + if(!z&&c012)hs=12;lo=half_step_to_e2m1(hs);if(s<0)lo+=8;} + if(!z&&c112)hs=12;hi=half_step_to_e2m1(hs);if(s<0)hi+=8;} out_fp4[bi*(hd/2)+b*8+i]=(hi<<4)|lo; } }