[Kernel][Quantization] add w4a8 support for marlin kernel (#24722)

Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
2025-11-29 23:19:33 +08:00
parent fa59fe417f
commit 1656ad3704
46 changed files with 4371 additions and 2240 deletions
--- a/csrc/quantization/gptq_marlin/kernel.h
+++ b/csrc/quantization/gptq_marlin/kernel.h
@@ -11,17 +11,19 @@
  const int4 *__restrict__ A, const int4 *__restrict__ B,                      \
      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                          \
      const int4 *__restrict__ b_bias_ptr,                                     \
+      const float *__restrict__ a_scales_ptr,                                  \
      const int4 *__restrict__ scales_ptr,                                     \
-      const uint16_t *__restrict__ scale2_ptr,                                 \
+      const uint16_t *__restrict__ global_scale_ptr,                           \
      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,          \
      int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \
      bool has_bias, bool use_atomic_add, bool use_fp32_reduce,                \
      int max_shared_mem

 namespace MARLIN_NAMESPACE_NAME {
-template <typename scalar_t,  // compute dtype, half or nv_float16
-          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const vllm::ScalarTypeId s_type_id,  // weight ScalarType id
+template <const vllm::ScalarTypeId a_type_id,  // A ScalarType id
+          const vllm::ScalarTypeId b_type_id,  // B ScalarType id
+          const vllm::ScalarTypeId c_type_id,  // C ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // B_SCALE ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the