[Kernel] LoRA - Enable CUDAGraphs for V1 (#14626)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit was authored by Varun Sundar Rabindranath on 2025-03-13 23:42:04 -04:00 and committed via GitHub.
parent 32ef4983cd
commit 0b1cfa6180
4 changed files with 35 additions and 14 deletions

View File

@@ -254,7 +254,9 @@ class PunicaWrapperGPU(PunicaWrapperBase, V1KernelMixin):
         y_org = y
         y = y.view(-1, y.shape[-1])
         if lora_bias_stacked is not None:
-            self._apply_bias(self.token_lora_indices, y, output_slices,
-                             lora_bias_stacked)
+            token_lora_indices = torch.narrow(self._token_lora_indices, 0, 0,
+                                              y.size(0))
+            self._apply_bias(token_lora_indices, y, output_slices,
+                             lora_bias_stacked)
         if env.VLLM_USE_V1:
@@ -365,7 +367,9 @@ class PunicaWrapperGPU(PunicaWrapperBase, V1KernelMixin):
         assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
         if lora_bias_stacked is not None:
             assert len(lora_bias_stacked) == len(output_slices)
-            y = self._apply_bias(self.token_lora_indices, y, output_slices,
-                                 lora_bias_stacked)
+            token_lora_indices = torch.narrow(self._token_lora_indices, 0, 0,
+                                              y.size(0))
+            y = self._apply_bias(token_lora_indices, y, output_slices,
+                                 lora_bias_stacked)
         if buffer is None: