[Core] Always use tensor cores for Flashinfer Decode Wrapper (#23214)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
This commit is contained in:
@@ -136,9 +136,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
||||
|
||||
# Baseline Decode
|
||||
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
|
||||
workspace_buffer,
|
||||
kv_layout,
|
||||
use_tensor_cores=((num_qo_heads // num_kv_heads) > 4))
|
||||
workspace_buffer, kv_layout, use_tensor_cores=True)
|
||||
wrapper.plan(kv_indptr,
|
||||
kv_indices,
|
||||
kv_last_page_lens,
|
||||
|
||||
Reference in New Issue
Block a user