Support bfloat16 data type (#54)

Author: Woosuk Kwon
Date: 2023-05-03 14:09:44 -07:00
Committed by: GitHub
parent 436e523bf1
commit e070829ae8
12 changed files with 455 additions and 53 deletions


@@ -1,7 +1,7 @@
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 
-#include "attention_dtypes.cuh"
+#include "attention_dtypes.h"
 #include "attention_utils.cuh"
 
 #include <algorithm>
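The hunk above renames the dtype header from a .cuh to a plain .h file. Its contents are not part of this diff, but the ENABLE_BF16 guard and __nv_bfloat16 type used in the dispatch below suggest a gating pattern along these lines; apart from those two names and the standard CUDA headers, this is a hypothetical sketch, not the actual file:

// Hypothetical sketch of a dtype header gated on ENABLE_BF16;
// not the actual contents of attention_dtypes.h.
#pragma once

#include <cuda_fp16.h>   // FP16 types (__half, __half2)
#ifdef ENABLE_BF16
#include <cuda_bf16.h>   // BF16 types (__nv_bfloat16, __nv_bfloat162)
#endif

Under that assumption, compiling with -DENABLE_BF16 (for example via extra_compile_args in the extension build) is what enables the BFloat16 branch in the dispatch below.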
@@ -438,9 +438,13 @@ void single_query_cached_kv_attention(
   torch::Tensor& context_lens,  // [num_seqs]
   int block_size,
   int max_context_len) {
-  // TODO(woosuk): Support FP32 and BF16.
+  // TODO(woosuk): Support FP32.
   if (query.dtype() == at::ScalarType::Half) {
     CALL_KERNEL_LAUNCHER_BLOCK_SIZE(uint16_t);
+#ifdef ENABLE_BF16
+  } else if (query.dtype() == at::ScalarType::BFloat16) {
+    CALL_KERNEL_LAUNCHER_BLOCK_SIZE(__nv_bfloat16);
+#endif
+  } else {
     TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
   }
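The dispatch keyed on query.dtype() picks the device scalar type at runtime: FP16 tensors are forwarded as their raw uint16_t storage type, BF16 tensors as __nv_bfloat16, and the BF16 branch compiles away entirely when ENABLE_BF16 is not defined. The body of CALL_KERNEL_LAUNCHER_BLOCK_SIZE is not shown in this hunk; as a rough, self-contained sketch of the pattern its name suggests (runtime block_size mapped onto a compile-time template argument), where the launcher name and supported sizes are assumptions:

// Illustrative only -- not the macro body from this commit.
#include <cstdint>
#include <cstdio>

// Hypothetical templated launcher standing in for the real kernel launch.
template <typename T, int BLOCK_SIZE>
void attention_launcher(int num_seqs) {
  std::printf("block_size=%d, num_seqs=%d\n", BLOCK_SIZE, num_seqs);
}

// Map the runtime block_size onto a compile-time template parameter.
#define CALL_KERNEL_LAUNCHER_BLOCK_SIZE(T)                    \
  switch (block_size) {                                       \
    case 8:  attention_launcher<T, 8>(num_seqs);  break;      \
    case 16: attention_launcher<T, 16>(num_seqs); break;      \
    default: std::printf("unsupported block size\n"); break;  \
  }

void dispatch_fp16(int block_size, int num_seqs) {
  CALL_KERNEL_LAUNCHER_BLOCK_SIZE(uint16_t);
}

Branching on block_size at the call site lets the kernel treat the block size as a compile-time constant, which is presumably why the dispatch lives in a macro that is instantiated once per scalar type.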