[Model][Perf] Use cos and sin cache in QwenVL (#28798)

Signed-off-by: gcanlin <canlinguosdu@gmail.com>
2025-11-18 19:51:54 +08:00
parent 285eaa4285
commit b9489f51e1
6 changed files with 218 additions and 217 deletions
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -83,6 +83,11 @@ class RotaryEmbeddingBase(CustomOp):
        ):
            self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)

+    def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]:
+        cos_sin = self.cos_sin_cache[:seqlen]  # first `seqlen` cache rows
+        cos, sin = cos_sin.chunk(2, dim=-1)  # split last dim: cos | sin halves
+        return cos, sin
+

 class RotaryEmbedding(RotaryEmbeddingBase):
    def __init__(