[ROCm] Enable chunked prefill/paged attention in MLA on ROCm (#14316)

Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore
2025-03-12 08:51:20 -07:00
committed by GitHub
parent 4a754fcf15
commit d9f83d6206
2 changed files with 4 additions and 18 deletions

View File

@@ -3450,9 +3450,9 @@ class VllmConfig:
             self.compilation_config.level = CompilationLevel.NO_COMPILATION
         if self.model_config and self.model_config.use_mla and \
-            not current_platform.is_cuda():
+            not (current_platform.is_cuda() or current_platform.is_rocm()):
             logger.info(
-                "MLA is enabled on a non-cuda platform; forcing chunked "
+                "MLA is enabled on a non-GPU platform; forcing chunked "
                 "prefill and prefix caching to be disabled.")
             self.scheduler_config.enable_chunked_prefill = False
             self.scheduler_config.chunked_prefill_enabled = False