From da6fa2f1d6312c111fcfe7684e286d9de915f9c0 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 19 May 2026 16:43:28 +0000
Subject: [PATCH] Fix UnboundLocalError: move num_decode_tokens before debug
 print

---
 vllm/patches/deepseek_v4_attention.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py
index 09364b87..3165490d 100644
--- a/vllm/patches/deepseek_v4_attention.py
+++ b/vllm/patches/deepseek_v4_attention.py
@@ -653,13 +653,17 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
             rope_dim=self.rope_head_dim,
         )
 
+        # Split prefill and decode early; the debug print below needs num_decode_tokens
+        num_decode_tokens = swa_metadata.num_decode_tokens
+        num_prefills = swa_metadata.num_prefill_tokens
+        swa_only = self.compress_ratio <= 1
+
         # CRITICAL FIX: Write KV to paged cache (RoPE + fp8 quant + insert)
         if not hasattr(self, '_swa_inv_scale_cache'):
             max_slots = swa_kv_cache.shape[0] * swa_kv_cache.shape[1]
             self._swa_inv_scale_cache = torch.zeros(
                 max_slots, 1, dtype=torch.bfloat16, device=kv.device,
             )
-            # Debug: log cache shape info
             import sys
             print(f"[BLACKWELL] swa_kv_cache shape: {swa_kv_cache.shape}, "
                   f"block_size: {swa_metadata.block_size}, "
@@ -677,11 +681,6 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
             rope_dim=self.rope_head_dim,
         )
 
-        # Split prefill and decode
-        num_decode_tokens = swa_metadata.num_decode_tokens
-        num_prefills = swa_metadata.num_prefill_tokens
-        swa_only = self.compress_ratio <= 1
-
         # Get compressed KV cache and indexer metadata for CSA/HCA
         flashmla_metadata = None
         if not swa_only: