[LoRA] Disable linear LoRA kernel PDL (#31777)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
@@ -277,7 +277,7 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
|
||||
|
||||
## LoRA Support for Tower and Connector of Multi-Modal Model
|
||||
|
||||
Currently, vLLM experimentally supports LoRA for the Tower and Connector components of multi-modal models. To enable this feature, you need to implement the corresponding token helper functions for the tower and connector. For more details on the rationale behind this approach, please refer to [PR 26674](https://github.com/vllm-project/vllm/pull/26674). We welcome contributions to extend LoRA support to additional models' tower and connector.
|
||||
Currently, vLLM experimentally supports LoRA for the Tower and Connector components of multi-modal models. To enable this feature, you need to implement the corresponding token helper functions for the tower and connector. For more details on the rationale behind this approach, please refer to [PR 26674](https://github.com/vllm-project/vllm/pull/26674). We welcome contributions that extend LoRA support to the tower and connector of additional models. Please refer to [Issue 31479](https://github.com/vllm-project/vllm/issues/31479) to check the current model support status.
|
||||
|
||||
## Default LoRA Models For Multimodal Models
|
||||
|
||||
|
||||
@@ -231,9 +231,9 @@ def _fused_moe_lora_shrink(
|
||||
num_stages: int,
|
||||
split_k: int,
|
||||
mul_routed_weight: bool = False,
|
||||
use_gdc: bool = False,
|
||||
) -> None:
|
||||
w1_lora_a_stacked = lora_a_stacked[0]
|
||||
use_gdc = supports_pdl(qcurr_hidden_states.device)
|
||||
shrink_config = {
|
||||
"BLOCK_SIZE_M": block_size_m,
|
||||
"BLOCK_SIZE_N": block_size_n,
|
||||
@@ -326,6 +326,7 @@ def _fused_moe_lora_expand(
|
||||
split_k: int,
|
||||
mul_routed_weight: bool = False,
|
||||
offset: int = 0,
|
||||
use_gdc: bool = False,
|
||||
) -> None:
|
||||
b_ptr = _get_ptr(lora_b_stacked, device)
|
||||
K = max_lora_rank
|
||||
@@ -337,7 +338,6 @@ def _fused_moe_lora_expand(
|
||||
-1, a_intermediate_cache1.shape[3]
|
||||
)
|
||||
|
||||
use_gdc = supports_pdl(a_intermediate_cache1.device)
|
||||
expand_config = {
|
||||
"BLOCK_SIZE_M": block_size_m,
|
||||
"BLOCK_SIZE_N": block_size_n,
|
||||
@@ -466,7 +466,7 @@ def _fused_moe_lora(
|
||||
dtype=output.dtype,
|
||||
device=device,
|
||||
)
|
||||
|
||||
use_gdc = supports_pdl(device) and not fully_sharded
|
||||
_fused_moe_lora_shrink(
|
||||
a_intermediate_cache1,
|
||||
qcurr_hidden_states,
|
||||
@@ -495,6 +495,7 @@ def _fused_moe_lora(
|
||||
shrink_num_stages,
|
||||
shrink_split_k,
|
||||
mul_routed_weight,
|
||||
use_gdc=use_gdc,
|
||||
)
|
||||
|
||||
if fully_sharded:
|
||||
@@ -542,6 +543,7 @@ def _fused_moe_lora(
|
||||
expand_split_k,
|
||||
mul_routed_weight,
|
||||
offset,
|
||||
use_gdc=use_gdc,
|
||||
)
|
||||
|
||||
|
||||
@@ -604,6 +606,7 @@ def _fused_moe_lora_shrink_fake(
|
||||
num_stages: int,
|
||||
split_k: int,
|
||||
mul_routed_weight: bool = False,
|
||||
use_gdc: bool = False,
|
||||
) -> None:
|
||||
return
|
||||
|
||||
@@ -637,6 +640,7 @@ def _fused_moe_lora_expand_fake(
|
||||
num_stages: int,
|
||||
split_k: int,
|
||||
mul_routed_weight: bool = False,
|
||||
use_gdc: bool = False,
|
||||
) -> None:
|
||||
return
|
||||
|
||||
|
||||
@@ -14,8 +14,6 @@ from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr, get_lora_op_configs
|
||||
from vllm.triton_utils import tl, triton
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
|
||||
from .utils import supports_pdl
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _lora_expand_kernel(
|
||||
@@ -241,7 +239,9 @@ def _lora_expand(
|
||||
# thread blocks simply exit.
|
||||
MAX_LORAS,
|
||||
)
|
||||
use_gdc = supports_pdl(inputs.device)
|
||||
# PDL is temporarily disabled because LoRA kernels are not launched back-to-back,
|
||||
# which makes PDL ineffective and hurts kernel performance.
|
||||
use_gdc = False # supports_pdl(inputs.device)
|
||||
_lora_expand_kernel[grid](
|
||||
inputs,
|
||||
lora_ptr_tensor,
|
||||
|
||||
@@ -14,8 +14,6 @@ from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr, get_lora_op_configs
|
||||
from vllm.triton_utils import tl, triton
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
|
||||
from .utils import supports_pdl
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _lora_shrink_kernel(
|
||||
@@ -221,7 +219,9 @@ def _lora_shrink(
|
||||
# thread blocks exit early.
|
||||
MAX_LORAS,
|
||||
)
|
||||
use_gdc = supports_pdl(inputs.device)
|
||||
# PDL is temporarily disabled because LoRA kernels are not launched back-to-back,
|
||||
# which makes PDL ineffective and hurts kernel performance.
|
||||
use_gdc = False # supports_pdl(inputs.device)
|
||||
_lora_shrink_kernel[grid](
|
||||
inputs,
|
||||
lora_ptr_tensor,
|
||||
|
||||
Reference in New Issue
Block a user