From da6fa2f1d6312c111fcfe7684e286d9de915f9c0 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 19 May 2026 16:43:28 +0000 Subject: [PATCH] Fix UnboundLocalError: move num_decode_tokens before debug print --- vllm/patches/deepseek_v4_attention.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/patches/deepseek_v4_attention.py b/vllm/patches/deepseek_v4_attention.py index 09364b87..3165490d 100644 --- a/vllm/patches/deepseek_v4_attention.py +++ b/vllm/patches/deepseek_v4_attention.py @@ -653,13 +653,17 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer): rope_dim=self.rope_head_dim, ) + # Split prefill and decode + num_decode_tokens = swa_metadata.num_decode_tokens + num_prefills = swa_metadata.num_prefill_tokens + swa_only = self.compress_ratio <= 1 + # CRITICAL FIX: Write KV to paged cache (RoPE + fp8 quant + insert) if not hasattr(self, '_swa_inv_scale_cache'): max_slots = swa_kv_cache.shape[0] * swa_kv_cache.shape[1] self._swa_inv_scale_cache = torch.zeros( max_slots, 1, dtype=torch.bfloat16, device=kv.device, ) - # Debug: log cache shape info import sys print(f"[BLACKWELL] swa_kv_cache shape: {swa_kv_cache.shape}, " f"block_size: {swa_metadata.block_size}, " @@ -677,11 +681,6 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer): rope_dim=self.rope_head_dim, ) - # Split prefill and decode - num_decode_tokens = swa_metadata.num_decode_tokens - num_prefills = swa_metadata.num_prefill_tokens - swa_only = self.compress_ratio <= 1 - # Get compressed KV cache and indexer metadata for CSA/HCA flashmla_metadata = None if not swa_only: