[Kernel] Add more dtype support for GGUF dequantization (#15879)

Signed-off-by: lukas.bluebaum <lukas.bluebaum@aleph-alpha.com>
This commit is contained in:
LukasBluebaum
2025-04-02 10:58:48 +02:00
committed by GitHub
parent 101f1481f9
commit 90969fb39a
9 changed files with 80 additions and 50 deletions

View File

@@ -71,14 +71,19 @@ static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx,
}
torch::Tensor ggml_dequantize(torch::Tensor W, // quant weight
int64_t type, int64_t m, int64_t n) {
int64_t type, int64_t m, int64_t n,
std::optional<at::ScalarType> const& dtype) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
auto options =
torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
auto dtype_ = dtype.value_or(torch::kFloat16);
auto options = torch::TensorOptions().dtype(dtype_).device(W.device());
at::Tensor DW = torch::empty({m, n}, options);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
to_fp16_cuda((void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream);
VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] {
auto to_cuda = ggml_get_to_cuda<scalar_t>(type);
to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream);
});
return DW;
}