[CI/Build] Enforce style for C++ and CUDA code with clang-format (#4722)
This commit is contained in:
@@ -25,32 +25,28 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
|
||||
namespace vllm {
|
||||
namespace aqlm {
|
||||
|
||||
__global__ void Code1x16MatVec(
|
||||
const int4* __restrict__ A,
|
||||
const int4* __restrict__ B,
|
||||
int4* __restrict__ C,
|
||||
const int4* __restrict__ codebook,
|
||||
const int prob_m,
|
||||
const int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
|
||||
const int codebook_stride // as int4.
|
||||
const int4* __restrict__ A, const int4* __restrict__ B,
|
||||
int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m,
|
||||
const int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each
|
||||
// codebook, at most 3 long.
|
||||
const int codebook_stride // as int4.
|
||||
) {
|
||||
int a_gl_stride = prob_k / 8 / 8;
|
||||
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
|
||||
bool pred = a_gl_rd < prob_m;
|
||||
|
||||
if (pred)
|
||||
{
|
||||
// advance to the correct codebook, this easy because we only multiply one column of the codebook.
|
||||
if (pred) {
|
||||
// advance to the correct codebook, this easy because we only multiply one
|
||||
// column of the codebook.
|
||||
auto codebook_size = &codebook_a_sizes.x;
|
||||
while (a_gl_rd >= *codebook_size)
|
||||
{
|
||||
codebook += codebook_stride;
|
||||
++codebook_size;
|
||||
while (a_gl_rd >= *codebook_size) {
|
||||
codebook += codebook_stride;
|
||||
++codebook_size;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,8 +63,7 @@ __global__ void Code1x16MatVec(
|
||||
// We pad shared memory to avoid bank conflicts during reads
|
||||
__syncthreads();
|
||||
for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
|
||||
if (b_gl_rd + i < prob_k / 8)
|
||||
sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
|
||||
if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
|
||||
}
|
||||
__syncthreads();
|
||||
b_gl_rd += 32 * 8;
|
||||
@@ -76,22 +71,19 @@ __global__ void Code1x16MatVec(
|
||||
int b_sh_rd = 9 * (threadIdx.x % 32);
|
||||
if (pred && a_gl_rd < a_gl_end) {
|
||||
const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++) {
|
||||
uint32_t dec[4];
|
||||
// We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
|
||||
// actually help us; this brings > 2x speedup.
|
||||
asm volatile (
|
||||
"ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
|
||||
: "l"((void*) &codebook[enc[i]])
|
||||
);
|
||||
// We bypass the L1 cache to avoid massive amounts of memory streaming
|
||||
// that doesn't actually help us; this brings > 2x speedup.
|
||||
asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
|
||||
: "l"((void*)&codebook[enc[i]]));
|
||||
half2* a = reinterpret_cast<half2*>(&dec);
|
||||
half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
|
||||
half2 res2 = {};
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; j++)
|
||||
res2 = __hfma2(a[j], b[j], res2);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2);
|
||||
res += __half2float(res2.x) + __half2float(res2.y);
|
||||
b_sh_rd++;
|
||||
}
|
||||
@@ -100,37 +92,33 @@ __global__ void Code1x16MatVec(
|
||||
}
|
||||
|
||||
if (pred) {
|
||||
#pragma unroll
|
||||
for (int i = 16; i > 0; i /= 2)
|
||||
res += __shfl_down_sync(0xffffffff, res, i);
|
||||
#pragma unroll
|
||||
for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i);
|
||||
if (threadIdx.x % 32 == 0)
|
||||
reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void Code2x8MatVec(
|
||||
const int4* __restrict__ A,
|
||||
const int4* __restrict__ B,
|
||||
int4* __restrict__ C,
|
||||
const int4* __restrict__ codebook,
|
||||
int prob_m,
|
||||
int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
|
||||
const int codebook_stride // as int4.
|
||||
const int4* __restrict__ A, const int4* __restrict__ B,
|
||||
int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m,
|
||||
int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each
|
||||
// codebook, at most 3 long.
|
||||
const int codebook_stride // as int4.
|
||||
|
||||
) {
|
||||
int a_gl_stride = prob_k / 8 / 8;
|
||||
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
|
||||
bool pred = a_gl_rd < prob_m;
|
||||
|
||||
if (pred)
|
||||
{
|
||||
// advance to the correct codebook, this easy because we only multiply one column of the codebook.
|
||||
if (pred) {
|
||||
// advance to the correct codebook, this easy because we only multiply one
|
||||
// column of the codebook.
|
||||
auto codebook_size = &codebook_a_sizes.x;
|
||||
while (a_gl_rd >= *codebook_size)
|
||||
{
|
||||
codebook += codebook_stride;
|
||||
++codebook_size;
|
||||
while (a_gl_rd >= *codebook_size) {
|
||||
codebook += codebook_stride;
|
||||
++codebook_size;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,9 +136,8 @@ __global__ void Code2x8MatVec(
|
||||
|
||||
for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
|
||||
int4 dec = codebook[i];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; j++)
|
||||
sh_code[8 * i + (j + lane) % 8] = dec;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -161,8 +148,7 @@ __global__ void Code2x8MatVec(
|
||||
// We pad shared memory to avoid bank conflicts during reads
|
||||
__syncthreads();
|
||||
for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
|
||||
if (b_gl_rd + i < prob_k / 8)
|
||||
sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
|
||||
if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
|
||||
}
|
||||
__syncthreads();
|
||||
b_gl_rd += 32 * 8;
|
||||
@@ -170,13 +156,15 @@ __global__ void Code2x8MatVec(
|
||||
int b_sh_rd = 9 * (threadIdx.x % 32);
|
||||
if (pred && a_gl_rd < a_gl_end) {
|
||||
const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++) {
|
||||
half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
|
||||
half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
|
||||
half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
|
||||
half2* a0 =
|
||||
reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
|
||||
half2* a1 =
|
||||
reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
|
||||
half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
|
||||
half2 res2 = {};
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; j++)
|
||||
res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2);
|
||||
res += __half2float(res2.x) + __half2float(res2.y);
|
||||
@@ -187,36 +175,31 @@ __global__ void Code2x8MatVec(
|
||||
}
|
||||
|
||||
if (pred) {
|
||||
#pragma unroll
|
||||
for (int i = 16; i > 0; i /= 2)
|
||||
res += __shfl_down_sync(0xffffffff, res, i);
|
||||
#pragma unroll
|
||||
for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i);
|
||||
if (threadIdx.x % 32 == 0)
|
||||
reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void Code1x16Dequant(
|
||||
const int4* __restrict__ A,
|
||||
int4* __restrict__ C,
|
||||
const int4* __restrict__ codebook,
|
||||
int prob_m,
|
||||
int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m.
|
||||
const int codebook_stride // as int4
|
||||
const int4* __restrict__ A, int4* __restrict__ C,
|
||||
const int4* __restrict__ codebook, int prob_m, int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each
|
||||
// codebook, at most 3 long, sums to m.
|
||||
const int codebook_stride // as int4
|
||||
) {
|
||||
int a_gl_stride = prob_k / 8 / 8;
|
||||
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
|
||||
bool pred = a_gl_rd < prob_m;
|
||||
|
||||
if (pred)
|
||||
{
|
||||
// advance to the correct codebook, this easy because we only multiply one column of the codebook.
|
||||
if (pred) {
|
||||
// advance to the correct codebook, this easy because we only multiply one
|
||||
// column of the codebook.
|
||||
auto codebook_size = &codebook_a_sizes.x;
|
||||
while (a_gl_rd >= *codebook_size)
|
||||
{
|
||||
codebook += codebook_stride;
|
||||
++codebook_size;
|
||||
while (a_gl_rd >= *codebook_size) {
|
||||
codebook += codebook_stride;
|
||||
++codebook_size;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -231,17 +214,15 @@ __global__ void Code1x16Dequant(
|
||||
while (iters--) {
|
||||
if (pred && a_gl_rd < a_gl_end) {
|
||||
const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++) {
|
||||
int4 chunk;
|
||||
auto dec = reinterpret_cast<uint32_t*>(&chunk);
|
||||
// We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
|
||||
// actually help us; this brings > 2x speedup.
|
||||
asm volatile (
|
||||
"ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
|
||||
: "l"((void*) &codebook[enc[i]])
|
||||
);
|
||||
// We bypass the L1 cache to avoid massive amounts of memory streaming
|
||||
// that doesn't actually help us; this brings > 2x speedup.
|
||||
asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
|
||||
: "l"((void*)&codebook[enc[i]]));
|
||||
|
||||
C[a_gl_rd * 8 + i] = chunk;
|
||||
}
|
||||
@@ -250,28 +231,25 @@ __global__ void Code1x16Dequant(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void Code2x8Dequant(
|
||||
const int4* __restrict__ A,
|
||||
int4* __restrict__ C,
|
||||
const int4* __restrict__ codebook,
|
||||
int prob_m,
|
||||
int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
|
||||
const int codebook_stride // as int4
|
||||
const int4* __restrict__ A, int4* __restrict__ C,
|
||||
const int4* __restrict__ codebook, int prob_m, int prob_k,
|
||||
const int4
|
||||
codebook_a_sizes, // cumulative sizes of A spanning each codebook, at
|
||||
// most 3 long, corresponds to cols.
|
||||
const int codebook_stride // as int4
|
||||
) {
|
||||
int a_gl_stride = prob_k / 8 / 8;
|
||||
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
|
||||
bool pred = a_gl_rd < prob_m;
|
||||
|
||||
if (pred)
|
||||
{
|
||||
// advance to the correct codebook, this easy because we only multiply one column of the codebook.
|
||||
if (pred) {
|
||||
// advance to the correct codebook, this easy because we only multiply one
|
||||
// column of the codebook.
|
||||
auto codebook_size = &codebook_a_sizes.x;
|
||||
while (a_gl_rd >= *codebook_size)
|
||||
{
|
||||
codebook += codebook_stride;
|
||||
++codebook_size;
|
||||
while (a_gl_rd >= *codebook_size) {
|
||||
codebook += codebook_stride;
|
||||
++codebook_size;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -290,9 +268,8 @@ __global__ void Code2x8Dequant(
|
||||
|
||||
for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
|
||||
int4 dec = codebook[i];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; j++)
|
||||
sh_code[8 * i + (j + lane) % 8] = dec;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -302,12 +279,14 @@ __global__ void Code2x8Dequant(
|
||||
while (iters--) {
|
||||
if (pred && a_gl_rd < a_gl_end) {
|
||||
const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++) {
|
||||
int4 chunk;
|
||||
half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
|
||||
half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
|
||||
#pragma unroll
|
||||
half2* a0 =
|
||||
reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
|
||||
half2* a1 =
|
||||
reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; j++)
|
||||
reinterpret_cast<half2*>(&chunk)[j] = __hadd2(a0[j], a1[j]);
|
||||
C[a_gl_rd * 8 + i] = chunk;
|
||||
@@ -317,22 +296,15 @@ __global__ void Code2x8Dequant(
|
||||
}
|
||||
}
|
||||
|
||||
inline int ceildiv(int a, int b) {
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
inline int ceildiv(int a, int b) { return (a + b - 1) / b; }
|
||||
|
||||
const int THREAD_M = 16;
|
||||
|
||||
void code1x16_matvec_cuda(
|
||||
const void* __restrict__ A,
|
||||
const void* __restrict__ B,
|
||||
void* __restrict__ C,
|
||||
const void* __restrict__ codebook,
|
||||
int prob_m,
|
||||
int prob_k,
|
||||
const int4 codebook_a_sizes,
|
||||
const int codebook_stride
|
||||
) {
|
||||
void code1x16_matvec_cuda(const void* __restrict__ A,
|
||||
const void* __restrict__ B, void* __restrict__ C,
|
||||
const void* __restrict__ codebook, int prob_m,
|
||||
int prob_k, const int4 codebook_a_sizes,
|
||||
const int codebook_stride) {
|
||||
int sms;
|
||||
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
|
||||
int waves = 0;
|
||||
@@ -345,28 +317,16 @@ void code1x16_matvec_cuda(
|
||||
int blocks = ceildiv(prob_m, thread_m);
|
||||
int threads = 32 * thread_m;
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
Code1x16MatVec<<<blocks, threads, 16*32*9, stream>>>(
|
||||
(const int4*) A,
|
||||
(const int4*) B,
|
||||
(int4*) C,
|
||||
(const int4*) codebook,
|
||||
prob_m,
|
||||
prob_k,
|
||||
codebook_a_sizes,
|
||||
codebook_stride
|
||||
);
|
||||
Code1x16MatVec<<<blocks, threads, 16 * 32 * 9, stream>>>(
|
||||
(const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m,
|
||||
prob_k, codebook_a_sizes, codebook_stride);
|
||||
}
|
||||
|
||||
void code2x8_matvec_cuda(
|
||||
const void* __restrict__ A,
|
||||
const void* __restrict__ B,
|
||||
void* __restrict__ C,
|
||||
const void* __restrict__ codebook,
|
||||
int prob_m,
|
||||
int prob_k,
|
||||
const int4 codebook_a_sizes,
|
||||
const int codebook_stride
|
||||
) {
|
||||
void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B,
|
||||
void* __restrict__ C,
|
||||
const void* __restrict__ codebook, int prob_m,
|
||||
int prob_k, const int4 codebook_a_sizes,
|
||||
const int codebook_stride) {
|
||||
int sms;
|
||||
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
|
||||
int waves = 0;
|
||||
@@ -379,30 +339,20 @@ void code2x8_matvec_cuda(
|
||||
int blocks = ceildiv(prob_m, thread_m);
|
||||
int threads = 32 * thread_m;
|
||||
int shared = 16 * (2 * 256 * 8 + 32 * 9);
|
||||
cudaFuncSetAttribute(
|
||||
Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
|
||||
);
|
||||
cudaFuncSetAttribute(Code2x8MatVec,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared);
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
Code2x8MatVec<<<blocks, threads, shared, stream>>>(
|
||||
(const int4*) A,
|
||||
(const int4*) B,
|
||||
(int4*) C,
|
||||
(const int4*) codebook,
|
||||
prob_m,
|
||||
prob_k,
|
||||
codebook_a_sizes,
|
||||
codebook_stride
|
||||
);
|
||||
(const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m,
|
||||
prob_k, codebook_a_sizes, codebook_stride);
|
||||
}
|
||||
|
||||
void code1x16_dequant_cuda(
|
||||
const void* __restrict__ A,
|
||||
void* __restrict__ C,
|
||||
const void* __restrict__ codebook,
|
||||
int prob_m,
|
||||
int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
|
||||
const int codebook_stride // as int4.
|
||||
const void* __restrict__ A, void* __restrict__ C,
|
||||
const void* __restrict__ codebook, int prob_m, int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each
|
||||
// codebook, at most 3 long.
|
||||
const int codebook_stride // as int4.
|
||||
) {
|
||||
int sms;
|
||||
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
|
||||
@@ -417,25 +367,21 @@ void code1x16_dequant_cuda(
|
||||
int threads = 32 * thread_m;
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
Code1x16Dequant<<<blocks, threads, 0, stream>>>(
|
||||
(const int4*) A,
|
||||
(int4*) C,
|
||||
(const int4*) codebook,
|
||||
prob_m,
|
||||
prob_k,
|
||||
codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
|
||||
codebook_stride // as int4.
|
||||
(const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k,
|
||||
codebook_a_sizes, // cumulative sizes of A spanning each codebook, at
|
||||
// most 3 long.
|
||||
codebook_stride // as int4.
|
||||
);
|
||||
}
|
||||
|
||||
// Dequantizes the code and codebook into weights.
|
||||
void code2x8_dequant_cuda(
|
||||
const void* __restrict__ A,
|
||||
void* __restrict__ C,
|
||||
const void* __restrict__ codebook,
|
||||
int prob_m,
|
||||
int prob_k,
|
||||
const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
|
||||
const int codebook_stride // as int4
|
||||
void code2x8_dequant_cuda(
|
||||
const void* __restrict__ A, void* __restrict__ C,
|
||||
const void* __restrict__ codebook, int prob_m, int prob_k,
|
||||
const int4
|
||||
codebook_a_sizes, // cumulative sizes of A spanning each codebook, at
|
||||
// most 3 long, corresponds to cols.
|
||||
const int codebook_stride // as int4
|
||||
) {
|
||||
int sms;
|
||||
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
|
||||
@@ -451,74 +397,50 @@ void code2x8_dequant_cuda(
|
||||
int shared = 16 * (2 * 256 * 8 + 32 * 9);
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
|
||||
cudaFuncSetAttribute(
|
||||
Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
|
||||
);
|
||||
cudaFuncSetAttribute(Code2x8Dequant,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared);
|
||||
Code2x8Dequant<<<blocks, threads, shared, stream>>>(
|
||||
(const int4*) A,
|
||||
(int4*) C,
|
||||
(const int4*) codebook,
|
||||
prob_m,
|
||||
prob_k,
|
||||
codebook_a_sizes,
|
||||
codebook_stride
|
||||
);
|
||||
(const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k,
|
||||
codebook_a_sizes, codebook_stride);
|
||||
}
|
||||
|
||||
int codebook_stride(const torch::Tensor& codebooks)
|
||||
{
|
||||
int codebook_stride(const torch::Tensor& codebooks) {
|
||||
return codebooks.stride(0) * codebooks.element_size() / sizeof(int4);
|
||||
}
|
||||
|
||||
void code1x16_matvec(
|
||||
const torch::Tensor& A,
|
||||
const torch::Tensor& B,
|
||||
torch::Tensor& C,
|
||||
const torch::Tensor& codebook,
|
||||
const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long.
|
||||
const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C,
|
||||
const torch::Tensor& codebook,
|
||||
const int4 codebook_a_sizes // cumulative sizes of A spanning each
|
||||
// codebook, at most 3 long.
|
||||
) {
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
|
||||
int prob_m = C.size(0);
|
||||
int prob_k = B.size(0);
|
||||
|
||||
code1x16_matvec_cuda(
|
||||
A.data_ptr(),
|
||||
B.data_ptr(),
|
||||
C.data_ptr(),
|
||||
codebook.data_ptr(),
|
||||
prob_m,
|
||||
prob_k,
|
||||
codebook_a_sizes,
|
||||
codebook_stride(codebook)
|
||||
);
|
||||
code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(),
|
||||
codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes,
|
||||
codebook_stride(codebook));
|
||||
}
|
||||
|
||||
torch::Tensor code1x16_matmat(
|
||||
const torch::Tensor& input,
|
||||
const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
const torch::Tensor& scales,
|
||||
const int4 codebook_a_sizes,
|
||||
const std::optional<torch::Tensor>& bias) {
|
||||
torch::Tensor code1x16_matmat(const torch::Tensor& input,
|
||||
const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
const torch::Tensor& scales,
|
||||
const int4 codebook_a_sizes,
|
||||
const std::optional<torch::Tensor>& bias) {
|
||||
auto input_sizes = input.sizes();
|
||||
auto out_features = codes.size(0) * codebooks.size(2);
|
||||
auto flat_input = input.reshape({-1, input.size(-1)});
|
||||
auto flat_output = torch::empty({flat_input.size(0), out_features},
|
||||
torch::TensorOptions()
|
||||
.dtype(input.dtype())
|
||||
.device(input.device())
|
||||
);
|
||||
auto flat_output = torch::empty(
|
||||
{flat_input.size(0), out_features},
|
||||
torch::TensorOptions().dtype(input.dtype()).device(input.device()));
|
||||
|
||||
for (int i = 0; i < flat_input.size(0); ++i) {
|
||||
auto input_vec = flat_input.index({i});
|
||||
auto output_vec = flat_output.index({i});
|
||||
code1x16_matvec(
|
||||
codes.squeeze(2),
|
||||
input_vec,
|
||||
output_vec,
|
||||
codebooks,
|
||||
codebook_a_sizes
|
||||
);
|
||||
code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks,
|
||||
codebook_a_sizes);
|
||||
}
|
||||
flat_output *= scales.flatten().unsqueeze(0);
|
||||
|
||||
@@ -533,55 +455,35 @@ torch::Tensor code1x16_matmat(
|
||||
return output;
|
||||
}
|
||||
|
||||
void code2x8_matvec(
|
||||
const torch::Tensor& A,
|
||||
const torch::Tensor& B,
|
||||
torch::Tensor& C,
|
||||
const torch::Tensor& codebook,
|
||||
const int4 codebook_a_sizes
|
||||
) {
|
||||
void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B,
|
||||
torch::Tensor& C, const torch::Tensor& codebook,
|
||||
const int4 codebook_a_sizes) {
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
|
||||
int prob_m = C.size(0);
|
||||
int prob_k = B.size(0);
|
||||
code2x8_matvec_cuda(
|
||||
A.data_ptr(),
|
||||
B.data_ptr(),
|
||||
C.data_ptr(),
|
||||
codebook.data_ptr(),
|
||||
prob_m,
|
||||
prob_k,
|
||||
codebook_a_sizes,
|
||||
2 * codebook_stride(codebook)
|
||||
);
|
||||
code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(),
|
||||
codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes,
|
||||
2 * codebook_stride(codebook));
|
||||
}
|
||||
|
||||
torch::Tensor code2x8_matmat(
|
||||
const torch::Tensor& input,
|
||||
const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
const torch::Tensor& scales,
|
||||
const int4 codebook_a_sizes,
|
||||
const std::optional<torch::Tensor>& bias
|
||||
) {
|
||||
torch::Tensor code2x8_matmat(const torch::Tensor& input,
|
||||
const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
const torch::Tensor& scales,
|
||||
const int4 codebook_a_sizes,
|
||||
const std::optional<torch::Tensor>& bias) {
|
||||
auto input_sizes = input.sizes();
|
||||
auto out_features = codes.size(0) * codebooks.size(2);
|
||||
auto flat_input = input.reshape({-1, input.size(-1)});
|
||||
auto flat_output = torch::empty({flat_input.size(0), out_features},
|
||||
torch::TensorOptions()
|
||||
.dtype(input.dtype())
|
||||
.device(input.device())
|
||||
);
|
||||
auto flat_output = torch::empty(
|
||||
{flat_input.size(0), out_features},
|
||||
torch::TensorOptions().dtype(input.dtype()).device(input.device()));
|
||||
|
||||
for (int i = 0; i < flat_input.size(0); ++i) {
|
||||
auto input_vec = flat_input.index({i});
|
||||
auto output_vec = flat_output.index({i});
|
||||
code2x8_matvec(
|
||||
codes.squeeze(2),
|
||||
input_vec,
|
||||
output_vec,
|
||||
codebooks,
|
||||
codebook_a_sizes
|
||||
);
|
||||
code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks,
|
||||
codebook_a_sizes);
|
||||
}
|
||||
flat_output *= scales.flatten().unsqueeze(0);
|
||||
if (bias.has_value()) {
|
||||
@@ -596,64 +498,56 @@ torch::Tensor code2x8_matmat(
|
||||
}
|
||||
|
||||
// Accumulate the partition sizes.
|
||||
int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes)
|
||||
{
|
||||
int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) {
|
||||
int4 cumulative_sizes;
|
||||
auto cumulative_size = &cumulative_sizes.x;
|
||||
int i = 0;
|
||||
int last = 0;
|
||||
assert(codebook_partition_sizes.size(0) <= 4);
|
||||
for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size)
|
||||
{
|
||||
for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) {
|
||||
*cumulative_size = codebook_partition_sizes[i].item<int>() + last;
|
||||
last = *cumulative_size;
|
||||
}
|
||||
// fill in the rest with unreachable.
|
||||
for (; i < 4; ++i, ++cumulative_size)
|
||||
{
|
||||
*cumulative_size = last*10;
|
||||
for (; i < 4; ++i, ++cumulative_size) {
|
||||
*cumulative_size = last * 10;
|
||||
}
|
||||
return cumulative_sizes;
|
||||
}
|
||||
|
||||
} // namespace aqlm
|
||||
} // namespace vllm
|
||||
} // namespace aqlm
|
||||
} // namespace vllm
|
||||
|
||||
|
||||
torch::Tensor aqlm_gemm(
|
||||
const torch::Tensor& input,
|
||||
const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
const torch::Tensor& scales,
|
||||
const torch::Tensor& codebook_partition_sizes,
|
||||
const std::optional<torch::Tensor>& bias
|
||||
)
|
||||
{
|
||||
int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
|
||||
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
const torch::Tensor& scales,
|
||||
const torch::Tensor& codebook_partition_sizes,
|
||||
const std::optional<torch::Tensor>& bias) {
|
||||
int4 cumulative_sizes =
|
||||
vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
|
||||
|
||||
int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
|
||||
int const entries = codebooks.size(1);
|
||||
|
||||
if (nbooks == 1 && entries == (1 << 16))
|
||||
{
|
||||
return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
|
||||
if (nbooks == 1 && entries == (1 << 16)) {
|
||||
return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales,
|
||||
cumulative_sizes, bias);
|
||||
}
|
||||
if (nbooks == 2 && entries == (1 << 8))
|
||||
{
|
||||
return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
|
||||
if (nbooks == 2 && entries == (1 << 8)) {
|
||||
return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales,
|
||||
cumulative_sizes, bias);
|
||||
}
|
||||
|
||||
TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
|
||||
TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries,
|
||||
" entries is not currently supported.")
|
||||
return {};
|
||||
}
|
||||
|
||||
torch::Tensor aqlm_dequant(
|
||||
const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
const torch::Tensor& codebook_partition_sizes
|
||||
)
|
||||
{
|
||||
int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
|
||||
torch::Tensor aqlm_dequant(const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
const torch::Tensor& codebook_partition_sizes) {
|
||||
int4 cumulative_sizes =
|
||||
vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
|
||||
|
||||
int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
|
||||
int const entries = codebooks.size(1);
|
||||
@@ -668,45 +562,37 @@ torch::Tensor aqlm_dequant(
|
||||
assert(out_features = codebook_partition_sizes.sum().item<int>());
|
||||
|
||||
auto weights = torch::empty({out_features, in_features},
|
||||
torch::TensorOptions()
|
||||
.dtype(codebooks.dtype())
|
||||
.device(codebooks.device())
|
||||
);
|
||||
torch::TensorOptions()
|
||||
.dtype(codebooks.dtype())
|
||||
.device(codebooks.device()));
|
||||
|
||||
if (nbooks == 1 && entries == (1 << 16))
|
||||
{
|
||||
vllm::aqlm::code1x16_dequant_cuda(
|
||||
codes.data_ptr(),
|
||||
weights.data_ptr(),
|
||||
codebooks.data_ptr(),
|
||||
out_features,
|
||||
in_features,
|
||||
cumulative_sizes,
|
||||
vllm::aqlm::codebook_stride(codebooks));
|
||||
if (nbooks == 1 && entries == (1 << 16)) {
|
||||
vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(),
|
||||
codebooks.data_ptr(), out_features,
|
||||
in_features, cumulative_sizes,
|
||||
vllm::aqlm::codebook_stride(codebooks));
|
||||
|
||||
// if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.)
|
||||
// weights *= scales.index({"...", 0, 0});
|
||||
// if you wanted to flip to scaling the weights, (though it's 30%-ish slower
|
||||
// and not consistent with gemv implementation.) weights *=
|
||||
// scales.index({"...", 0, 0});
|
||||
|
||||
return weights;
|
||||
return weights;
|
||||
}
|
||||
|
||||
if (nbooks == 2 && entries == (1 << 8))
|
||||
{
|
||||
vllm::aqlm::code2x8_dequant_cuda(
|
||||
codes.data_ptr(),
|
||||
weights.data_ptr(),
|
||||
codebooks.data_ptr(),
|
||||
out_features,
|
||||
in_features,
|
||||
cumulative_sizes,
|
||||
vllm::aqlm::codebook_stride(codebooks));
|
||||
if (nbooks == 2 && entries == (1 << 8)) {
|
||||
vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(),
|
||||
codebooks.data_ptr(), out_features,
|
||||
in_features, cumulative_sizes,
|
||||
vllm::aqlm::codebook_stride(codebooks));
|
||||
|
||||
// if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation)
|
||||
// weights *= scales.index({"...", 0, 0});
|
||||
// if you wanted to flip to scaling the weights, (though it's 30%-ish slower
|
||||
// and not consistent with gemv implementation) weights *=
|
||||
// scales.index({"...", 0, 0});
|
||||
|
||||
return weights;
|
||||
return weights;
|
||||
}
|
||||
|
||||
TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
|
||||
TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries,
|
||||
" entries is not currently supported.")
|
||||
return {};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user