[Attention] MLA with chunked prefill (#12639)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Patrick Horn <patrick.horn@gmail.com> Co-authored-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-21 18:30:12 -05:00
parent 900edbfa48
commit 288cc6c234
18 changed files with 1910 additions and 1275 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3332,19 +3332,6 @@ class VllmConfig:

        current_platform.check_and_update_config(self)

-        # If MLA is enabled, force disable chunked prefill and prefix caching
-        if self.model_config and self.model_config.use_mla:
-            logger.info("MLA is enabled; forcing chunked prefill and prefix "
-                        "caching to be disabled.")
-            self.scheduler_config.enable_chunked_prefill = False
-            self.scheduler_config.chunked_prefill_enabled = False
-            self.scheduler_config.max_num_batched_tokens = max(
-                self.scheduler_config.max_model_len,
-                _DEFAULT_MAX_NUM_BATCHED_TOKENS)
-
-            if self.cache_config is not None:
-                self.cache_config.enable_prefix_caching = False
-
        if not self.instance_id:
            self.instance_id = random_uuid()[:5]