dynamic dispatch of fp8 kernels (#14245)
Signed-off-by: Jeff Daily <jeff.daily@amd.com>
@@ -219,7 +219,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsW8A8Fp8)
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
+    Fp8LinearGenericOp, is_fp8)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     scaled_quantize)
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
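
The import change above is the heart of the commit: instead of freezing the FP8 dtype at import time through the module-level constant current_platform_fp8_dtype, the dtype is now looked up at runtime via current_platform.fp8_dtype(). A minimal sketch of what such a platform-dispatched lookup can look like is shown below; the class names and dtype choices are illustrative assumptions, not vLLM's actual platform code.

import torch

# Illustrative sketch only: per-platform FP8 dtype resolved at call time.
# vLLM's real vllm.platforms.current_platform object is richer than this.
class _CudaLikePlatform:
    def fp8_dtype(self) -> torch.dtype:
        # NVIDIA GPUs use the OCP e4m3 format.
        return torch.float8_e4m3fn

class _RocmLikePlatform:
    def fp8_dtype(self) -> torch.dtype:
        # Some AMD GPUs use the fnuz variant of e4m3.
        return torch.float8_e4m3fnuz

# Resolved when the code runs, so one code path picks the right dtype per platform.
current_platform = _RocmLikePlatform() if torch.version.hip else _CudaLikePlatform()
fp8_dtype = current_platform.fp8_dtype()
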
@@ -826,7 +826,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         W_Q_UK, W_Q_UK_scales = scaled_quantize(
             W_Q_UK,
             self.reqaunt_weight_group_shape,
-            quant_dtype=current_platform_fp8_dtype)
+            quant_dtype=current_platform.fp8_dtype())
         # For FP8 save the transpose so we can use
         # `apply_w8a8_block_fp8_linear` directly
         self.W_Q_UK = W_Q_UK.T.contiguous()
@@ -843,7 +843,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         W_UV_O, W_UV_O_scales = scaled_quantize(
             W_UV_O,
             self.reqaunt_weight_group_shape,
-            quant_dtype=current_platform_fp8_dtype)
+            quant_dtype=current_platform.fp8_dtype())
         # For FP8 save the transpose so we can use
         # `apply_w8a8_block_fp8_linear` directly
         self.W_UV_O = W_UV_O.T.contiguous()
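
For context on the two call sites changed above: the absorbed MLA weight matrices are quantized with scaled_quantize and stored transposed so apply_w8a8_block_fp8_linear can consume them directly. A hedged sketch of that quantize-then-transpose pattern follows; the toy scaled_quantize body, the group shape, and the tensor sizes are assumptions for illustration, and only the call shape mirrors the diff.

import torch

# Stand-in for current_platform.fp8_dtype(); real vLLM resolves this per platform.
fp8_dtype = torch.float8_e4m3fnuz if torch.version.hip else torch.float8_e4m3fn

def scaled_quantize(w: torch.Tensor, group_shape, quant_dtype: torch.dtype):
    # Toy per-tensor quantization standing in for vLLM's grouped helper.
    scale = w.abs().amax().clamp(min=1e-6) / torch.finfo(quant_dtype).max
    return (w / scale).to(quant_dtype), scale

W_Q_UK = torch.randn(512, 128)
W_Q_UK_q, W_Q_UK_scales = scaled_quantize(
    W_Q_UK,
    (128, 128),             # illustrative requant weight group shape
    quant_dtype=fp8_dtype)  # dtype chosen dynamically per platform
# For FP8 save the transpose so a w8a8 block-FP8 linear kernel can use it directly.
W_Q_UK_T = W_Q_UK_q.T.contiguous()
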