[Attention] MLA with chunked prefill (#12639)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Patrick Horn <patrick.horn@gmail.com> Co-authored-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
@@ -1170,9 +1170,9 @@ class EngineArgs:
             # long context (> 32K) models. This is to avoid OOM errors in the
             # initial memory profiling phase.

-            # For multimodal models, chunked prefill is disabled by default in
-            # V0, but enabled by design in V1
-            if model_config.is_multimodal_model:
+            # For multimodal models and models with MLA, chunked prefill is
+            # disabled by default in V0, but enabled by design in V1
+            if model_config.is_multimodal_model or model_config.use_mla:
                 self.enable_chunked_prefill = bool(envs.VLLM_USE_V1)

             elif use_long_context:
@@ -1207,7 +1207,6 @@ class EngineArgs:
                 msg = "Chunked prefill is not supported for pooling models"
                 raise ValueError(msg)

-
         speculative_config = SpeculativeConfig.maybe_create_spec_config(
             target_model_config=model_config,
             target_parallel_config=parallel_config,
||||
Reference in New Issue
Block a user