Fix CUDA kernel index data type in vllm/csrc/quantization/fused_kernels/layernorm_utils.cuh +10 (#15159)

Signed-off-by: Lu Fang <lufang@fb.com> Co-authored-by: Richard Barnes <rbarnes@meta.com>
2025-03-20 19:01:11 -07:00
parent 0cfe7d386d
commit d3ccbd6350
10 changed files with 124 additions and 124 deletions
--- a/csrc/quantization/gguf/dequantize.cuh
+++ b/csrc/quantization/gguf/dequantize.cuh
@@ -101,10 +101,10 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
 template<typename dst_t>
 static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_q2_K * x = (const block_q2_K *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int n   = tid/32;
    const int l   = tid - 32*n;
    const int is  = 8*n + l/16;
@@ -123,10 +123,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;
    const block_q3_K * x = (const block_q3_K *) vx;

-    const int r = threadIdx.x/4;
+    const auto r = threadIdx.x/4;
    const int tid = r/2;
    const int is0 = r%2;
    const int l0 = 16*is0 + 4*(threadIdx.x%4);
@@ -164,10 +164,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q4_K * x = (const block_q4_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 32 threads
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
    const int is  = 2*il;
@@ -197,10 +197,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q5_K * x = (const block_q5_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il  = tid/16;   // il is in 0...3
    const int ir  = tid%16;   // ir is in 0...15
    const int is  = 2*il;     // is is in 0...6
@@ -231,10 +231,10 @@ template<typename dst_t>
 static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q6_K * x = (const block_q6_K *) vx;

-    const int i = blockIdx.x;
+    const auto i = blockIdx.x;

    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int ip  = tid/32;   // ip is 0 or 1
    const int il  = tid - 32*ip; // 0...32
    const int is  = 8*ip + il/16;
@@ -256,10 +256,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -275,10 +275,10 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -293,10 +293,10 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq2_s * x = (const block_iq2_s *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -309,10 +309,10 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -332,10 +332,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq3_s * x = (const block_iq3_s *) vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -399,10 +399,10 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
@@ -417,10 +417,10 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst

 template<typename dst_t>
 static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const int i   = blockIdx.x;
+    const auto i   = blockIdx.x;
    const block_iq4_xs * x = (const block_iq4_xs *)vx;

-    const int tid = threadIdx.x;
+    const auto tid = threadIdx.x;
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
@@ -565,4 +565,4 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
        default:
            return nullptr;
    }
-}
+}
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@@ -19,11 +19,11 @@ template <typename scalar_t>
 static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
                                     void* __restrict__ vy, const int kx,
                                     const int kx_padded) {
-  const int ix = blockDim.x * blockIdx.x + threadIdx.x;
+  const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
  if (ix >= kx_padded) {
    return;
  }
-  const int iy = blockDim.y * blockIdx.y + threadIdx.y;
+  const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
  const int i_padded = iy * kx_padded + ix;

  block_q8_1* y = (block_q8_1*)vy;
--- a/csrc/quantization/gguf/mmq.cuh
+++ b/csrc/quantization/gguf/mmq.cuh
@@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q(

    const int & ncols_dst = ncols_y;

-    const int row_dst_0 = blockIdx.x*mmq_y;
+    const auto row_dst_0 = blockIdx.x*mmq_y;
    const int & row_x_0 = row_dst_0;

-    const int col_dst_0 = blockIdx.y*mmq_x;
+    const auto col_dst_0 = blockIdx.y*mmq_x;
    const int & col_y_0 = col_dst_0;

    int   * tile_x_ql = nullptr;
@@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q(

 #pragma unroll
        for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
-            const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
+            const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
            const int kbxd = kqs / QI8_1;

 #pragma unroll
@@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q(
 #pragma unroll
            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
-                const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
+                const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);

                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
@@ -87,14 +87,14 @@ static __device__ __forceinline__ void mul_mat_q(

 #pragma unroll
    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + threadIdx.y;
+        const auto col_dst = col_dst_0 + j + threadIdx.y;
        if (col_dst >= ncols_dst) {
            return;
        }

 #pragma unroll
        for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-            const int row_dst = row_dst_0 + threadIdx.x + i;
+            const auto row_dst = row_dst_0 + threadIdx.x + i;
            if (row_dst >= nrows_dst) {
                continue;
            }
--- a/csrc/quantization/gguf/mmvq.cuh
+++ b/csrc/quantization/gguf/mmvq.cuh
@@ -1,7 +1,7 @@
 // copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
 template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const auto row = blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
@@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

-    for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
+    for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
        const int ibx = row*blocks_per_row + i; // x block index

        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
--- a/csrc/quantization/gguf/moe.cuh
+++ b/csrc/quantization/gguf/moe.cuh
@@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q(

  const int ncols_dst = ncols_y * top_k;

-  const int row_dst_0 = blockIdx.x * mmq_y;
+  const auto row_dst_0 = blockIdx.x * mmq_y;
  const int& row_x_0 = row_dst_0;

-  const int col_dst_0 = blockIdx.y * mmq_x;
+  const auto col_dst_0 = blockIdx.y * mmq_x;

  int token_offs[mmq_x / nwarps];
  for (int i = 0; i < mmq_x; i += nwarps) {
@@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q(
    const int n_per_r = ((qk * blocks_per_warp) / qr);
 #pragma unroll
    for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
-      const int kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
+      const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
      const int kbxd = kqs / QI8_1;

 #pragma unroll
@@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q(
      }

      if (threadIdx.x < n_per_r / QK8_1) {
-        const int kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
+        const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
        const int col_y_eff = token_offs[threadIdx.y] / top_k;
        const int block_x =
            ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
@@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q(

 #pragma unroll
    for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
-      const int row_dst = row_dst_0 + threadIdx.x + i;
+      const auto row_dst = row_dst_0 + threadIdx.x + i;
      if (row_dst >= nrows_dst) {
        continue;
      }