Apply fixes for CUDA 13 (#24599)

Signed-off-by: Aidyn-A <aidyn.b.aitzhan@gmail.com>
2025-09-17 17:15:42 +04:00
parent 9fccd04e30
commit bfe9380161
8 changed files with 47 additions and 56 deletions
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@@ -1,15 +1,10 @@
 #include "common.cuh"
 #include "dispatch_utils.h"
+#include "../../cub_helpers.h"
 #include "../vectorization_utils.cuh"
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/Exceptions.h>

-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-#else
-  #include <hipcub/hipcub.hpp>
-#endif
-
 namespace vllm {

 template <typename scalar_t, typename fp8_type>
@@ -116,7 +111,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
  const float block_max =
-      BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
+      BlockReduce(tmp).Reduce(absmax_val, CubMaxOp{}, blockDim.x);

  __shared__ float token_scale;
  if (tid == 0) {