[Perf] Optimize Vectorization Utils for Int 8 Quantization Kernels (#20331)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Wentao Ye
2025-07-04 03:06:24 -04:00
committed by GitHub
parent 4a98edff1f
commit 783921d889
2 changed files with 106 additions and 7 deletions

View File

@@ -162,10 +162,11 @@ __global__ void dynamic_scaled_int8_quant_kernel(
// calculate for absmax
float thread_max = 0.f;
for (int i = tid; i < hidden_size; i += stride) {
const auto v = fabsf(static_cast<float>(row_in[i]));
thread_max = fmaxf(thread_max, v);
}
vectorize_read_with_alignment<16>(
row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) {
const float v = fabsf(static_cast<float>(src));
thread_max = fmaxf(thread_max, v);
});
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
@@ -232,9 +233,10 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
// 1. calculate min & max
MinMax thread_mm;
for (int i = tid; i < hidden_size; i += stride) {
thread_mm += static_cast<float>(row_in[i]);
}
vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride,
[&] __device__(const scalar_t& src) {
thread_mm += static_cast<float>(src);
});
using BlockReduce = cub::BlockReduce<MinMax, 256>;
__shared__ typename BlockReduce::TempStorage tmp;