[Attention] MLA with chunked prefill (#12639)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Patrick Horn <patrick.horn@gmail.com> Co-authored-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-21 18:30:12 -05:00
parent 900edbfa48
commit 288cc6c234
18 changed files with 1910 additions and 1275 deletions
--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
@@ -2,10 +2,14 @@

 #include <stdio.h>

-#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
-  #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
-  #define DEVICE_INLINE __forceinline__ __device__
-  #define HOST_INLINE __forceinline__ __host__
+#if defined(__HIPCC__)
+  #define HOST_DEVICE_INLINE __host__ __device__
+  #define DEVICE_INLINE __device__
+  #define HOST_INLINE __host__
+#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
+  #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+  #define DEVICE_INLINE __device__ __forceinline__
+  #define HOST_INLINE __host__ __forceinline__
 #else
  #define HOST_DEVICE_INLINE inline
  #define DEVICE_INLINE inline
@@ -25,3 +29,13 @@
 int64_t get_device_attribute(int64_t attribute, int64_t device_id);

 int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
+
+namespace cuda_utils {
+
+template <typename T>
+HOST_DEVICE_INLINE constexpr std::enable_if_t<std::is_integral_v<T>, T>
+ceil_div(T a, T b) {
+  return (a + b - 1) / b;
+}
+
+};  // namespace cuda_utils