[Kernel][CPU] CPU MLA (#14744)

Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
2025-03-25 17:34:59 +08:00
parent 4157f563b4
commit 4f044b1d67
15 changed files with 1010 additions and 17 deletions
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -37,6 +37,9 @@ class CpuPlatform(Platform):
                             use_mla: bool) -> str:
        if selected_backend and selected_backend != _Backend.TORCH_SDPA:
            logger.info("Cannot use %s backend on CPU.", selected_backend)
+        if use_mla:
+            logger.info("Using CPU MLA backend.")
+            return "vllm.attention.backends.cpu_mla.CPUMLABackend"
        logger.info("Using Torch SDPA backend.")
        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"

@@ -129,9 +132,6 @@ class CpuPlatform(Platform):
        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

-        # MLA attention is not supported
-        os.environ["VLLM_MLA_DISABLE"] = "1"
-
        # Intel OpenMP setting
        ld_prealod_str = os.getenv("LD_PRELOAD", "")
        if "libiomp5.so" in ld_prealod_str: