[Bugfix] LoRA V0 - Fix case where max_num_seqs is between cudagraph capture sizes (#15308)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
committed by
GitHub
parent
2fa0e1396b
commit
8a8b30eac1
@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union, final
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.lora.layers import LoRAMapping
|
||||
from vllm.triton_utils import HAS_TRITON
|
||||
|
||||
@@ -42,8 +43,15 @@ class PunicaWrapperGPU(PunicaWrapperBase):
|
||||
self.token_mapping_meta = LoRAKernelMeta.make(self.max_loras,
|
||||
max_num_batched_tokens,
|
||||
device=device)
|
||||
|
||||
# When cudagraph capture size is greater than max_num_seqs (max_batches,
|
||||
# here), V0 captures the graph as if max_num_seqs is set to
|
||||
# the capture size.
|
||||
# V1 doesn't have this problem and always respects max_num_seqs.
|
||||
max_num_prompts = (max_batches
|
||||
if envs.VLLM_USE_V1 else max_num_batched_tokens)
|
||||
self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
|
||||
- max_batches,
|
||||
+ max_num_prompts,
|
||||
device=device)
|
||||
|
||||
def update_metadata(
|
||||
|
||||
Reference in New Issue
Block a user