[BugFix] Fix fp4 quant kernel on CUDA 12.8 (#35210)
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
This commit is contained in:
Committed by: GitHub
Parent: 160424a937
Commit: 86c3b5a808
@@ -107,7 +107,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
(uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
|
||||
reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
|
||||
} else {
|
||||
- out[inOffset] = out_val;
+ int64_t outOffset =
+ rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+ out[outOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -140,7 +142,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
|
||||
- int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
+ int sf_n_unpadded = int(n / CVT_FP4_ELTS_PER_THREAD);
|
||||
|
||||
int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
|
||||
int grid_x = std::min(
|
||||
|
||||
Reference in New Issue
Block a user