[Core] Optimizing cross-attention QKVParallelLinear computation (#12325)

Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: NickLucche <nick@nlucches-4xa100.c.openshift-330514.internal>
Co-authored-by: NickLucche <nick@nlucches-4xa100.c.openshift-330514.internal>
Author: Nicolò Lucchesi
Date: 2025-03-06 10:37:26 +01:00 (committed via GitHub)
Parent: 5d802522a7
Commit: 69ff99fdcd
4 changed files with 121 additions and 44 deletions


@@ -650,4 +650,4 @@ def cast_overflow_tensors(
     if tensors.isinf().any() or tensors.isnan().any():
         clamp_value = torch.finfo(tensors.dtype).max - offset
         tensors = torch.clamp(tensors, min=-clamp_value, max=clamp_value)
-        return tensors
+    return tensors