Override attention metadata for fast prefill in some KV sharing setups (#21590)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
This commit is contained in:
Yong Hoon Shin
2025-07-30 08:54:15 -07:00
committed by GitHub
parent 366f6b3a4d
commit ad510309ee
6 changed files with 287 additions and 26 deletions

View File

@@ -793,6 +793,7 @@ class Gemma3nForConditionalGeneration(nn.Module):
del lora_config # Unused.
super().__init__()
self.config = config
self.cache_config = vllm_config.cache_config
self.model = Gemma3nModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "model"))
self.logits_processor = LogitsProcessor(