[Kernel] Replaced blockReduce[...] functions with cub::BlockReduce (#7233)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
@@ -7,7 +7,13 @@
|
||||
#include "cuda_compat.h"
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
- #include "../../reduction_utils.cuh"
|
||||
+ #ifndef USE_ROCM
|
||||
+ #include <cub/util_type.cuh>
|
||||
+ #include <cub/cub.cuh>
|
||||
+ #else
|
||||
+ #include <hipcub/util_type.hpp>
|
||||
+ #include <hipcub/hipcub.hpp>
|
||||
+ #endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
using FP8_TYPE = c10::Float8_e4m3fn;
|
||||
@@ -215,7 +221,10 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
- float const block_absmax_val_maybe = blockReduceMax(absmax_val);
|
||||
+ using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
+ __shared__ typename BlockReduce::TempStorage reduceStorage;
|
||||
+ float const block_absmax_val_maybe =
|
||||
+ BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
|
||||
__shared__ float token_scale;
|
||||
if (tid == 0) {
|
||||
if (scale_ub) {
|
||||
|
||||
Reference in New Issue
Block a user