[Attention] Deepseek v3 MLA support with FP8 compute (#12601)
This PR implements the Deepseek V3 support by performing matrix absorption the fp8 weights --------- Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: simon-mo <simon.mo@hey.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com> Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
This commit is contained in:
12
vllm/envs.py
12
vllm/envs.py
@@ -79,6 +79,7 @@ if TYPE_CHECKING:
|
||||
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
|
||||
VLLM_MLA_DISABLE: bool = False
|
||||
VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
|
||||
VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@@ -519,7 +520,16 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
# storing more weights, W_Q_UK and W_UV_O, so it can increase memory usage,
|
||||
# this is enabled by default
|
||||
"VLLM_MLA_PERFORM_MATRIX_ABSORPTION":
|
||||
lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1")))
|
||||
lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1"))),
|
||||
|
||||
# When running MLA with matrix-absorption enabled and fp8 quantized weights
|
||||
# we perform the matrix-absorption in float32 precision, after the matrices
|
||||
# are absorbed we requantize the weights back to fp8, this flag can be used
|
||||
# to disable the requantization step, and instead convert the absorbed
|
||||
# matrices to match the activation type. This can lead to higher memory and
|
||||
# compute usage but better preserves the accuracy of the original model.
|
||||
"VLLM_MLA_DISABLE_REQUANTIZATION":
|
||||
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
|
||||
}
|
||||
|
||||
# end-env-vars-definition
|
||||
|
||||
Reference in New Issue
Block a user