[Attention] MLA with chunked prefill (#12639)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Patrick Horn <patrick.horn@gmail.com> Co-authored-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
@@ -2,10 +2,14 @@
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
|
||||
#define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
|
||||
#define DEVICE_INLINE __forceinline__ __device__
|
||||
#define HOST_INLINE __forceinline__ __host__
|
||||
#if defined(__HIPCC__)
|
||||
#define HOST_DEVICE_INLINE __host__ __device__
|
||||
#define DEVICE_INLINE __device__
|
||||
#define HOST_INLINE __host__
|
||||
#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
|
||||
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
|
||||
#define DEVICE_INLINE __device__ __forceinline__
|
||||
#define HOST_INLINE __host__ __forceinline__
|
||||
#else
|
||||
#define HOST_DEVICE_INLINE inline
|
||||
#define DEVICE_INLINE inline
|
||||
@@ -25,3 +29,13 @@
|
||||
// Declaration only; the definition is not part of this excerpt.
// NOTE(review): presumably returns the value of the device attribute
// identified by `attribute` for the device with index `device_id` — confirm
// against the definition before relying on this.
int64_t get_device_attribute(int64_t attribute, int64_t device_id);
|
||||
|
||||
// Declaration only; the definition is not part of this excerpt.
// NOTE(review): judging by the name, this looks like it reports the maximum
// shared memory per block for the device with index `device_id`; the units
// and the exact attribute queried are not visible here — confirm against the
// definition.
int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
|
||||
|
||||
namespace cuda_utils {
|
||||
|
||||
template <typename T>
|
||||
HOST_DEVICE_INLINE constexpr std::enable_if_t<std::is_integral_v<T>, T>
|
||||
ceil_div(T a, T b) {
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
|
||||
}; // namespace cuda_utils
|
||||
Reference in New Issue
Block a user