Add Support for 2/3/8-bit GPTQ Quantization Models (#2330)

2024-02-29 13:52:23 +08:00
parent 929b4f2973
commit 01a5d18a53
8 changed files with 1663 additions and 156 deletions
--- a/csrc/quantization/gptq/matrix_view.cuh
+++ b/csrc/quantization/gptq/matrix_view.cuh
@@ -146,6 +146,129 @@ public:
    __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
 };

+class MatrixView_q2_row
+{
+public:
+    const uint32_t* data;
+    const int height;
+    const int width;
+
+    __device__ __forceinline__ MatrixView_q2_row(const uint32_t* data, const int height, const int width)
+        : data(data), height(height), width(width)
+    { }
+
+    __device__ __forceinline__ int item(int row, int column) const
+    {
+        int shift = (column & 0x0f) * 2;
+        return (data[row * width / 16 + column / 16] >> shift) & 0x03;
+    }
+
+    __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const
+    {
+        int shift = (column & 0x0f) * 2;
+        uint32_t d = data[row * width / 16 + column / 16] >> shift;
+        items[0] = d & 0x03;
+        items[1] = (d >> 2) & 0x03;
+    }
+
+    __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const
+    {
+        int shift = (column & 0x0f) * 2;
+        uint32_t d = data[row * width / 16 + column / 16] >> shift;
+        items[0] = d & 0x03;
+        items[1] = (d >> 2) & 0x03;
+        items[2] = (d >> 4) & 0x03;
+        items[3] = (d >> 6) & 0x03;
+    }
+};
+
+class MatrixView_q3_row
+{
+public:
+    const uint32_t* data;
+    const int height;
+    const int width;
+
+    __device__ __forceinline__ MatrixView_q3_row(const uint32_t* data, const int height, const int width)
+        : data(data), height(height), width(width)
+    { }
+
+    __device__ __forceinline__ int item(int row, int column) const
+    {
+        int z_w = column * 3 / 32;
+        int z_mod =  column & 0x1f;
+
+        if (z_mod == 10) {
+            return (data[row * width * 3 / 32 + z_w] >> 30) | ((data[row * width * 3 / 32 + (z_w + 1)] << 2) & 0x4);
+        } else if (z_mod == 21) {
+            return (data[row * width * 3 / 32 + z_w] >> 31) | ((data[row * width * 3 / 32 + (z_w + 1)] << 1) & 0x6);
+        } else if (z_mod < 10) {
+            return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3)) & 0x07;
+        } else if (z_mod < 21) {
+            return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3  - 32)) & 0x07;
+        } else {
+            return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3  - 64)) & 0x07;
+        }
+    }
+
+    __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const
+    {
+        int shift = (column & 0x1f);
+        uint32_t d;
+        if (shift <= 4) {
+            d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3);
+        } else if (shift == 8) {
+            d = (data[row * width / 32 * 3 + column * 3 / 32] >> 24) | ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0x0f) << 8);
+        } else if (shift <= 16) {
+            d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 32);
+        } else if (shift == 20) {
+            d = (data[row * width / 32 * 3 + column * 3 / 32] >> 28) | ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0xff) << 4);
+        } else {
+            d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 64);
+        }
+        items[0] = d & 0x07;
+        items[1] = (d >> 3) & 0x07;
+        items[2] = (d >> 6) & 0x07;
+        items[3] = (d >> 9) & 0x07;
+    }
+};
+
+class MatrixView_q8_row
+{
+public:
+    const uint32_t* data;
+    const int height;
+    const int width;
+
+    __device__ __forceinline__ MatrixView_q8_row(const uint32_t* data, const int height, const int width)
+        : data(data), height(height), width(width)
+    { }
+
+    __device__ __forceinline__ int item(int row, int column) const
+    {
+        int shift = (column & 0x03) * 8;
+        return (data[row * width / 4 + column / 4] >> shift) & 0xff;
+    }
+
+    __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const
+    {
+        int shift = (column & 0x03) * 8;
+        uint32_t d = data[row * width / 4 + column / 4] >> shift;
+        items[0] = d & 0xff;
+        items[1] = (d >> 8) & 0xff;
+    }
+
+    __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const
+    {
+        int shift = (column & 0x03) * 2;
+        uint32_t d = data[row * width / 4 + column / 4] >> shift;
+        items[0] = d & 0xff;
+        items[1] = (d >> 8) & 0xff;
+        items[2] = (d >> 16) & 0xff;
+        items[3] = (d >> 24) & 0xff;
+    }
+};
+
 }  // namespace gptq
 }  // namespace vllm
 #endif