[Attention] MLA with chunked prefill (#12639)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Patrick Horn <patrick.horn@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Lucas Wilkinson
2025-02-21 18:30:12 -05:00
committed by GitHub
parent 900edbfa48
commit 288cc6c234
18 changed files with 1910 additions and 1275 deletions

View File

@@ -1170,9 +1170,9 @@ class EngineArgs:
# long context (> 32K) models. This is to avoid OOM errors in the
# initial memory profiling phase.
-# For multimodal models, chunked prefill is disabled by default in
-# V0, but enabled by design in V1
-if model_config.is_multimodal_model:
+# For multimodal models and models with MLA, chunked prefill is
+# disabled by default in V0, but enabled by design in V1
+if model_config.is_multimodal_model or model_config.use_mla:
self.enable_chunked_prefill = bool(envs.VLLM_USE_V1)
elif use_long_context:
@@ -1207,7 +1207,6 @@ class EngineArgs:
msg = "Chunked prefill is not supported for pooling models"
raise ValueError(msg)
speculative_config = SpeculativeConfig.maybe_create_spec_config(
target_model_config=model_config,
target_parallel_config=parallel_config,