Revert "[Perf] Reduce MLA CPU overheads in V1 (#14384)" (#14471)

commit ca7a2d5f28 (parent 333681408f)
Author: Tyler Michael Smith
Committed: 2025-03-08 01:18:53 -05:00 (committed by GitHub)
2 changed files with 6 additions and 18 deletions


@@ -222,8 +222,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     scaled_quantize)
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.rotary_embedding import (
+    DeepseekScalingRotaryEmbedding, RotaryEmbedding)
 from vllm.utils import cdiv, round_down
 
 try:
@@ -627,15 +627,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         self.v_head_dim = v_head_dim
 
         self.rotary_emb = rotary_emb
-
-        if current_platform.is_cuda():
-            # Hack for V1 for now to avoid torch library overhead (since we are
-            # already inside an attention custom op), pull out the forward
-            # method from the rotary embedding and call it directly (and avoid
-            # calling forward_native, when we can call forward_cuda)
-            # TODO(lucas): we should probably find a cleaner way to do this
-            self.rotary_emb = rotary_emb.forward_cuda
-
+        self.use_yarn_rope = isinstance(rotary_emb,
+                                        DeepseekScalingRotaryEmbedding)
         self.q_proj = q_proj
         self.kv_b_proj = kv_b_proj
         self.o_proj = o_proj
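
For context on the hack being reverted: the removed comment describes binding the platform-specific forward_cuda method once at construction time, so that hot-path calls skip nn.Module.__call__ (and the torch custom-op wrapper around forward_native). Below is a minimal, self-contained sketch of that pattern; ToyRotaryEmbedding and apply_rope are hypothetical names for illustration, not vLLM's actual classes.

    import torch
    import torch.nn as nn


    class ToyRotaryEmbedding(nn.Module):
        # Hypothetical stand-in for vLLM's RotaryEmbedding, which exposes
        # platform-specific variants such as forward_native and forward_cuda.

        def forward_native(self, q: torch.Tensor) -> torch.Tensor:
            return q  # placeholder for the actual rotary math

        def forward_cuda(self, q: torch.Tensor) -> torch.Tensor:
            return q  # placeholder for the CUDA-specialized path

        def forward(self, q: torch.Tensor) -> torch.Tensor:
            # Generic entry point; nn.Module.__call__ adds Python-side
            # dispatch overhead on every invocation.
            return self.forward_native(q)


    rope = ToyRotaryEmbedding()

    # The optimization being reverted, conceptually: bind the specialized
    # method once at init time ...
    apply_rope = rope.forward_cuda if torch.cuda.is_available() else rope.forward

    # ... so each hot-path call skips nn.Module.__call__ entirely.
    out = apply_rope(torch.randn(4, 8))

    # The restored behavior: call the module through its normal forward path.
    out = rope(torch.randn(4, 8))

The restored code instead keeps the module object on self.rotary_emb and records use_yarn_rope via isinstance, so callers dispatch through the regular forward path again.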