[Attention] MLA get rid of materialization (#14770)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-14 02:39:02 -04:00
parent 0c2af17c76
commit 9532c49836
4 changed files with 114 additions and 496 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -84,8 +84,6 @@ if TYPE_CHECKING:
    VLLM_SERVER_DEV_MODE: bool = False
    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
    VLLM_MLA_DISABLE: bool = False
-    VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
-    VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
    VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True
    VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
@@ -563,23 +561,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_MLA_DISABLE":
    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),

-    # Flag that can control whether or not we perform matrix-absorption for MLA
-    # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
-    # matrices reduces the runtime FLOPs needed to compute MLA but requires
-    # storing more weights, W_Q_UK and W_UV_O, so can increase memory usage,
-    # the is enabled by default
-    "VLLM_MLA_PERFORM_MATRIX_ABSORPTION":
-    lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1"))),
-
-    # When running MLA with matrix-absorption enabled and fp8 quantized weights
-    # we perform the matrix-absorption in float32 precision, after the matrices
-    # are absorbed we requantize the weights back to fp8, this flag can be used
-    # to disable the requantization step, and instead convert the absorbed
-    # matrices to match the activation type. This can lead to higher memory and
-    # compute usage but better preserves the accuracy of the original model.
-    "VLLM_MLA_DISABLE_REQUANTIZATION":
-    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
-
    # If set, vLLM will use the Triton implementation of moe_align_block_size,
    # i.e. moe_align_block_size_triton in fused_moe.py.
    "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":