From 78c13e30e1641869672b4c5fb7685d04e58ca1df Mon Sep 17 00:00:00 2001
From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Date: Wed, 23 Jul 2025 15:59:30 -0700
Subject: [PATCH] [V1] Fix local chunked attention always disabled (#21419)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
---
 vllm/attention/layer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 1b80fa19d..178453ecd 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -143,6 +143,8 @@ class Attention(nn.Module):
         # the backends)
         if envs.VLLM_USE_V1:
             self.use_irope = extra_impl_args.pop("use_irope", False)
+        else:
+            self.use_irope = extra_impl_args.get("use_irope", False)
 
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
@@ -177,7 +179,6 @@ class Attention(nn.Module):
                              kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
-        self.use_irope = extra_impl_args.get("use_irope", False)
 
         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
         # torch.compile works by registering the attention as one giant