[Kernel] fp4 marlin kernel (#17687)

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-05-11 10:58:49 +08:00
parent ca66a1674c
commit d74e5f37bc
21 changed files with 1216 additions and 331 deletions
--- a/csrc/quantization/gptq_marlin/kernel.h
+++ b/csrc/quantization/gptq_marlin/kernel.h
@@ -7,13 +7,14 @@
 #include "marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"

-#define MARLIN_KERNEL_PARAMS                                                 \
-  const int4 *__restrict__ A, const int4 *__restrict__ B,                    \
-      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                        \
-      const int4 *__restrict__ scales_ptr, const int4 *__restrict__ zp_ptr,  \
-      const int *__restrict__ g_idx, int num_groups, int prob_m, int prob_n, \
-      int prob_k, int lda, int *locks, bool use_atomic_add,                  \
-      bool use_fp32_reduce, int max_shared_mem
+#define MARLIN_KERNEL_PARAMS                                                   \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,                      \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                          \
+      const int4 *__restrict__ scales_ptr,                                     \
+      const uint16_t *__restrict__ scale2_ptr,                                 \
+      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,          \
+      int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \
+      bool use_atomic_add, bool use_fp32_reduce, int max_shared_mem

 namespace MARLIN_NAMESPACE_NAME {
 template <typename scalar_t,  // compute dtype, half or nv_float16