From 78c13e30e1641869672b4c5fb7685d04e58ca1df Mon Sep 17 00:00:00 2001
From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Date: Wed, 23 Jul 2025 15:59:30 -0700
Subject: [PATCH] [V1] Fix local chunked attention always disabled (#21419)

Signed-off-by: Yong Hoon Shin
---
 vllm/attention/layer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 1b80fa19d..178453ecd 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -143,6 +143,8 @@ class Attention(nn.Module):
         # the backends)
         if envs.VLLM_USE_V1:
             self.use_irope = extra_impl_args.pop("use_irope", False)
+        else:
+            self.use_irope = extra_impl_args.get("use_irope", False)
 
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
@@ -177,7 +179,6 @@ class Attention(nn.Module):
                              kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
-        self.use_irope = extra_impl_args.get("use_irope", False)
 
         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
         # torch.compile works by registering the attention as one giant