[Misc] Add multipstep chunked-prefill support for FlashInfer (#10467)

This commit is contained in:
Elfie Guo
2025-01-14 20:47:49 -08:00
committed by GitHub
parent b7ee940a82
commit 0794e7446e
5 changed files with 162 additions and 102 deletions

View File

@@ -95,6 +95,16 @@ __global__ void advance_step_flashinfer_kernel(
long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
int const* block_tables_ptr, int64_t const block_tables_stride,
int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
int const n_pad = num_seqs - num_queries;
if (n_pad && blockIdx.x == 0) {
// Handle cuda graph padding
int const offset = num_queries;
for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
input_tokens_ptr[offset + i] = 0;
input_positions_ptr[offset + i] = 0;
slot_mapping_ptr[offset + i] = -1;
}
}
int num_query_blocks = div_ceil(num_queries, num_threads);
if (blockIdx.x < num_query_blocks) {