[V1] LoRA - Add triton kernels for V1 (#13096)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com> Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-10 17:27:53 -04:00
parent 0967110e42
commit 5ff0d32580
11 changed files with 1165 additions and 191 deletions
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -62,9 +62,9 @@ class LoRAModelRunnerMixin:
        if not self.lora_manager:
            raise RuntimeError("LoRA is not enabled.")

-        # We dont make any distinction between prefills and decodes in the
-        # scheduler. To that effect, set is_prefill to True so we use the
-        # sgmv punica kernels always.
+        # Set is_prefill to True, so we always use the SGMV kernels.
+        # For cuda platforms, we have specialized triton kernels, and
+        # the cuda path ignores `is_prefill`.
        lora_mapping = LoRAMapping(token_lora_mapping,
                                   prompt_lora_mapping,
                                   is_prefill=True)