[ModelRunner V2] Don't pin reused flashinfer tensors (#32799)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
@@ -603,7 +603,12 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             "earlier GPUs."
         )
         # Preparing persistent buffers
-        self.pin_memory = is_pin_memory_available()
+        # Since we do not have explicit synchronization in ModelRunnerV2, we do not pin
+        # reused CPU buffers to avoid a race condition between step N async copies to
+        # GPU and step N+1 buffer updates.
+        self.pin_memory = (
+            not envs.VLLM_USE_V2_MODEL_RUNNER and is_pin_memory_available()
+        )
         self.paged_kv_indptr = self._make_buffer(max_num_reqs + 1)
         self.paged_kv_indptr_cpu_buffer = torch.zeros_like(
             self.paged_kv_indptr.cpu, pin_memory=self.pin_memory
Reference in New Issue
Block a user