[ROCm] Disable chunked prefill/prefix caching when running MLA on non-CUDA platforms (#13844)

Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore
2025-02-25 22:56:58 -08:00
committed by GitHub
parent e656f638de
commit 1d35662e6d
2 changed files with 44 additions and 12 deletions

View File

@@ -3422,6 +3422,20 @@ class VllmConfig:
"Disabling `torch.compile`.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION
if self.model_config and self.model_config.use_mla and \
not current_platform.is_cuda():
logger.info(
"MLA is enabled on a non-cuda platform; forcing chunked "
"prefill and prefix caching to be disabled.")
self.scheduler_config.enable_chunked_prefill = False
self.scheduler_config.chunked_prefill_enabled = False
self.scheduler_config.max_num_batched_tokens = max(
self.scheduler_config.max_model_len,
_DEFAULT_MAX_NUM_BATCHED_TOKENS)
if self.cache_config is not None:
self.cache_config.enable_prefix_caching = False
current_platform.check_and_update_config(self)
if not self.instance_id: