[FP8] Add FP8 WoQ kernel abstraction (#32929)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
@@ -72,6 +72,9 @@ from vllm.model_executor.kernels.linear.scaled_mm.cutlass import (
 from vllm.model_executor.kernels.linear.scaled_mm.flashinfer import (
     FlashInferFP8ScaledMMLinearKernel,
 )
+from vllm.model_executor.kernels.linear.scaled_mm.marlin import (
+    MarlinFP8ScaledMMLinearKernel,
+)
 from vllm.model_executor.kernels.linear.scaled_mm.pytorch import (
     ChannelWiseTorchFP8ScaledMMLinearKernel,
     PerTensorTorchFP8ScaledMMLinearKernel,
@@ -104,6 +107,7 @@ _POSSIBLE_INT8_KERNELS: dict[PlatformEnum, list[type[Int8ScaledMMLinearKernel]]]
 # in priority/performance order (when available)
 _POSSIBLE_FP8_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]]] = {
     PlatformEnum.CUDA: [
+        MarlinFP8ScaledMMLinearKernel,
         FlashInferFP8ScaledMMLinearKernel,
         CutlassFP8ScaledMMLinearKernel,
         PerTensorTorchFP8ScaledMMLinearKernel,
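A minimal sketch of how a priority-ordered kernel table like `_POSSIBLE_FP8_KERNELS` can be consulted. The `pick_kernel` helper is hypothetical (the PR's actual selection sits behind `init_fp8_linear_kernel`), but the `is_supported`/`can_implement` classmethod protocol matches the kernel classes added in this commit.

def pick_kernel(candidates, config, compute_capability=None):
    # Walk candidates in declared priority order; first usable kernel wins.
    failures = []
    for kernel_cls in candidates:
        ok, reason = kernel_cls.is_supported(compute_capability)
        if ok:
            ok, reason = kernel_cls.can_implement(config)
        if ok:
            return kernel_cls
        failures.append(f"{kernel_cls.__name__}: {reason}")
    raise ValueError("no usable FP8 kernel: " + "; ".join(failures))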
@@ -14,6 +14,9 @@ from vllm.model_executor.kernels.linear.scaled_mm.cutlass import (
 from vllm.model_executor.kernels.linear.scaled_mm.flashinfer import (
     FlashInferFP8ScaledMMLinearKernel,
 )
+from vllm.model_executor.kernels.linear.scaled_mm.marlin import (
+    MarlinFP8ScaledMMLinearKernel,
+)
 from vllm.model_executor.kernels.linear.scaled_mm.pytorch import (
     ChannelWiseTorchFP8ScaledMMLinearKernel,
     PerTensorTorchFP8ScaledMMLinearKernel,
@@ -46,6 +49,7 @@ __all__ = [
     "CutlassFP8ScaledMMLinearKernel",
     "CutlassInt8ScaledMMLinearKernel",
     "FlashInferFP8ScaledMMLinearKernel",
+    "MarlinFP8ScaledMMLinearKernel",
     "ChannelWiseTorchFP8ScaledMMLinearKernel",
     "PerTensorTorchFP8ScaledMMLinearKernel",
     "RowWiseTorchFP8ScaledMMLinearKernel",
vllm/model_executor/kernels/linear/scaled_mm/marlin.py (new file, 120 lines)
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+import torch
+
+import vllm.envs as envs
+from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    process_fp8_weight_block_strategy,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+    apply_fp8_marlin_linear,
+    is_fp8_marlin_supported,
+    prepare_fp8_layer_for_marlin,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    kFp8Static128BlockSym,
+)
+from vllm.model_executor.utils import replace_parameter
+from vllm.platforms import current_platform
+
+from .ScaledMMLinearKernel import (
+    FP8ScaledMMLinearKernel,
+    FP8ScaledMMLinearLayerConfig,
+)
+
+
+class MarlinFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
+    """
+    FP8 Marlin kernel for GPUs that lack FP8 hardware support.
+    Leverages the Marlin kernel for fast weight-only FP8 quantization.
+    """
+
+    @classmethod
+    def is_supported(
+        cls, compute_capability: int | None = None
+    ) -> tuple[bool, str | None]:
+        if not current_platform.is_cuda():
+            return False, "requires CUDA."
+        # Check if platform supports FP8 Marlin
+        if not is_fp8_marlin_supported():
+            return False, "FP8 Marlin requires compute capability 7.5 or higher"
+        if vllm_is_batch_invariant():
+            return False, "FP8 Marlin not supported for batch invariant execution."
+        if (
+            compute_capability is not None
+            and compute_capability >= 89
+            and not envs.VLLM_TEST_FORCE_FP8_MARLIN
+        ):
+            return (
+                False,
+                "To apply FP8 Marlin on high-capability GPUs, please set "
+                "VLLM_TEST_FORCE_FP8_MARLIN=1",
+            )
+        return True, None
+
+    @classmethod
+    def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
+        return True, None
+
+    def __init__(
+        self, c: FP8ScaledMMLinearLayerConfig, layer_param_names: Sequence[str]
+    ) -> None:
+        super().__init__(c, layer_param_names)
+        self.marlin_input_dtype = None
+        self.block_quant = self.config.weight_quant_key in {kFp8Static128BlockSym}
+        self.size_k_first = not self.block_quant
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if self.block_quant:
+            weight, weight_scale_inv = process_fp8_weight_block_strategy(
+                layer.weight, layer.weight_scale_inv
+            )
+            # Update layer with new values
+            replace_parameter(layer, "weight", weight.data)
+            replace_parameter(layer, "weight_scale_inv", weight_scale_inv.data)
+        else:
+            weight = layer.weight.t()
+            replace_parameter(layer, "weight", weight.data)
+        layer.input_scale = None
+        prepare_fp8_layer_for_marlin(
+            layer, self.size_k_first, input_dtype=self.marlin_input_dtype
+        )
+        del layer.input_scale
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if self.block_quant:
+            weight_scale = layer.weight_scale_inv
+        else:
+            weight_scale = layer.weight_scale
+        return apply_fp8_marlin_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=weight_scale,
+            workspace=layer.workspace,
+            size_n=layer.output_size_per_partition,
+            size_k=layer.input_size_per_partition,
+            input_dtype=self.marlin_input_dtype,
+            bias=bias,
+        )
+
+    def apply_scaled_mm(
+        self,
+        *,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out_dtype: torch.dtype,
+        As: torch.Tensor,
+        Bs: torch.Tensor,
+        bias: torch.Tensor | None,
+        output_shape: list,
+    ) -> torch.Tensor:
+        pass
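Illustrative only: how the `is_supported` gate above behaves under this PR's rules. On CUDA with compute capability >= 89 (native FP8 hardware), Marlin is rejected unless VLLM_TEST_FORCE_FP8_MARLIN is set; from 7.5 up to 89 it is accepted as the weight-only fallback. The return values below are assumed from the method body, not observed output.

ok, why = MarlinFP8ScaledMMLinearKernel.is_supported(compute_capability=80)
# -> (True, None) on CUDA, when is_fp8_marlin_supported() holds and
#    batch-invariant mode is off

ok, why = MarlinFP8ScaledMMLinearKernel.is_supported(compute_capability=90)
# -> (False, "To apply FP8 Marlin on high-capability GPUs, please set
#     VLLM_TEST_FORCE_FP8_MARLIN=1"), unless the env var is set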
@@ -22,7 +22,6 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizeMethodBase,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    apply_fp8_marlin_linear,
     prepare_fp8_layer_for_marlin,
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -177,15 +176,4 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        if self.quant_config.use_marlin:
-            return apply_fp8_marlin_linear(
-                input=x,
-                weight=layer.weight,
-                weight_scale=layer.weight_scale,
-                workspace=layer.workspace,
-                size_n=layer.output_size_per_partition,
-                size_k=layer.input_size_per_partition,
-                bias=bias,
-            )
-
         return self.fp8_linear.apply_weights(layer, x, bias)
@@ -7,7 +7,6 @@ import torch
 from torch.nn import Module
 from torch.utils._python_dispatch import TorchDispatchMode
 
-import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
@@ -16,6 +15,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.kernels.linear import (
     init_fp8_linear_kernel,
 )
+from vllm.model_executor.kernels.linear.scaled_mm import MarlinFP8ScaledMMLinearKernel
 from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
@@ -61,10 +61,6 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     get_marlin_input_dtype,
 )
-from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    apply_fp8_marlin_linear,
-    prepare_fp8_layer_for_marlin,
-)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
     is_layer_skipped,
@@ -280,15 +276,6 @@ class Fp8LinearMethod(LinearMethodBase):
         # For GPUs that lack FP8 hardware support, we can leverage the Marlin
         # kernel for fast weight-only FP8 quantization
         self.marlin_input_dtype = None
-        self.use_marlin = (
-            not current_platform.has_device_capability(89)
-            or envs.VLLM_TEST_FORCE_FP8_MARLIN
-        )
-        # Disable marlin for rocm
-        if current_platform.is_rocm() or current_platform.is_xpu():
-            self.use_marlin = False
-        if vllm_is_batch_invariant():
-            self.use_marlin = False
 
         self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
         self.use_deep_gemm = is_deep_gemm_supported()
@@ -297,7 +284,28 @@ class Fp8LinearMethod(LinearMethodBase):
         self.block_quant = self.weight_block_size is not None
         self.act_q_static = self.quant_config.activation_scheme == "static"
 
+        # Use per-token quantization for better perf if dynamic and cutlass
+        if self.act_q_static:
+            activation_quant_key = kFp8StaticTensorSym
+        elif cutlass_fp8_supported():
+            activation_quant_key = kFp8DynamicTokenSym
+        else:
+            activation_quant_key = kFp8DynamicTensorSym
+
         if self.block_quant:
+            weight_quant_key = kFp8Static128BlockSym
+        else:
+            weight_quant_key = kFp8StaticTensorSym
+
+        self.fp8_linear = init_fp8_linear_kernel(
+            activation_quant_key=activation_quant_key,
+            weight_quant_key=weight_quant_key,
+            out_dtype=torch.get_default_dtype(),
+            module_name=self.__class__.__name__,
+        )
+        self.use_marlin = isinstance(self.fp8_linear, MarlinFP8ScaledMMLinearKernel)
+
+        if self.block_quant and not self.use_marlin:
             assert not self.act_q_static
             assert self.weight_block_size is not None
             self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
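The quant-key decision above, restated as a standalone sketch for clarity. The function name `select_fp8_quant_keys` is hypothetical, and string stand-ins are used in place of the kFp8* constants from quant_utils.

def select_fp8_quant_keys(act_q_static, cutlass_ok, block_quant):
    # Activation side: a static scale wins; otherwise prefer per-token
    # quantization when cutlass is available (better perf), else per-tensor.
    if act_q_static:
        act_key = "kFp8StaticTensorSym"
    elif cutlass_ok:
        act_key = "kFp8DynamicTokenSym"
    else:
        act_key = "kFp8DynamicTensorSym"
    # Weight side: 128-block scales for block quant, else per-tensor.
    w_key = "kFp8Static128BlockSym" if block_quant else "kFp8StaticTensorSym"
    return act_key, w_key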
@@ -306,21 +314,6 @@ class Fp8LinearMethod(LinearMethodBase):
                 cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
                 use_aiter_and_is_supported=self.use_aiter_and_is_supported,
             )
-        else:
-            # Use per-token quantization for better perf if dynamic and cutlass
-            if self.act_q_static:
-                activation_quant_key = kFp8StaticTensorSym
-            elif cutlass_fp8_supported():
-                activation_quant_key = kFp8DynamicTokenSym
-            else:
-                activation_quant_key = kFp8DynamicTensorSym
-
-            self.fp8_linear = init_fp8_linear_kernel(
-                activation_quant_key=activation_quant_key,
-                weight_quant_key=kFp8StaticTensorSym,
-                out_dtype=torch.get_default_dtype(),
-                module_name=self.__class__.__name__,
-            )
 
     def create_weights(
         self,
@@ -387,12 +380,18 @@ class Fp8LinearMethod(LinearMethodBase):
             layer.register_parameter("input_scale", scale)
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        size_k_first = True
+        if self.use_marlin:
+            # Only Marlin kernels support `marlin_input_dtype`; guard to avoid
+            # AttributeError if backend selection changes.
+            if hasattr(self.fp8_linear, "marlin_input_dtype"):
+                self.fp8_linear.marlin_input_dtype = self.marlin_input_dtype
+            self.fp8_linear.process_weights_after_loading(layer)
+            return
+
         input_scale = None
         # TODO(rob): refactor block quant into separate class.
         if self.block_quant:
             assert not self.act_q_static
-            size_k_first = False
-
             weight, weight_scale_inv = process_fp8_weight_block_strategy(
                 layer.weight, layer.weight_scale_inv
@@ -411,16 +410,15 @@ class Fp8LinearMethod(LinearMethodBase):
 
             # If using w8a8, torch._scaled_mm needs per tensor, so
             # requantize the logical shards as a single weight.
-            if not self.use_marlin:
-                weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy(
-                    weight,
-                    weight_scale,
-                    layer.logical_widths,
-                    getattr(layer, "input_scale", None),
-                )
-                if self.act_q_static:
-                    assert input_scale is not None
-                    input_scale = input_scale.max()
+            weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy(
+                weight,
+                weight_scale,
+                layer.logical_widths,
+                getattr(layer, "input_scale", None),
+            )
+            if self.act_q_static:
+                assert input_scale is not None
+                input_scale = input_scale.max()
             weight = weight.t()
 
             # Update layer with new values.
@@ -432,14 +430,6 @@ class Fp8LinearMethod(LinearMethodBase):
         else:
             layer.input_scale = None
 
-        if self.use_marlin:
-            prepare_fp8_layer_for_marlin(
-                layer, size_k_first, input_dtype=self.marlin_input_dtype
-            )
-            # Activations not quantized for marlin.
-            del layer.input_scale
-            return
-
         if self.block_quant:
             maybe_post_process_fp8_weight_block(layer)
@@ -486,21 +476,7 @@ class Fp8LinearMethod(LinearMethodBase):
             return torch.nn.functional.linear(x, weight_bf16.t(), bias)
 
         if self.use_marlin:
-            if self.block_quant:
-                weight_scale = layer.weight_scale_inv
-            else:
-                weight_scale = layer.weight_scale
-
-            return apply_fp8_marlin_linear(
-                input=x,
-                weight=layer.weight,
-                weight_scale=weight_scale,
-                workspace=layer.workspace,
-                size_n=layer.output_size_per_partition,
-                size_k=layer.input_size_per_partition,
-                input_dtype=self.marlin_input_dtype,
-                bias=bias,
-            )
+            return self.fp8_linear.apply_weights(layer, x, bias)
 
         if self.block_quant:
             assert self.weight_block_size is not None
@@ -623,18 +599,20 @@ class Fp8OnlineLinearMethod(Fp8LinearMethod):
 
         layer.input_scale = None
         qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
-        weight = qweight.t()
-
         # Update layer with new values.
-        replace_parameter(layer, "weight", weight.data)
+        replace_parameter(layer, "weight", qweight.data)
         replace_parameter(layer, "weight_scale", weight_scale.data)
 
         if self.use_marlin:
-            size_k_first = True
-            prepare_fp8_layer_for_marlin(
-                layer, size_k_first, input_dtype=self.marlin_input_dtype
-            )
-            # Activations not quantized for marlin.
+            # Only Marlin kernels support `marlin_input_dtype`; guard to avoid
+            # AttributeError if backend selection changes.
+            if hasattr(self.fp8_linear, "marlin_input_dtype"):
+                self.fp8_linear.marlin_input_dtype = self.marlin_input_dtype
+            self.fp8_linear.process_weights_after_loading(layer)
+        else:
+            weight = qweight.t()
+            replace_parameter(layer, "weight", weight.data)
 
         # Prevent duplicate processing (e.g., during weight reload)
         layer._already_called_process_weights_after_loading = True
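A sketch of the reload-guard pattern the flag above supports. Only the flag assignment is part of this hunk; the early-return check is assumed to live in the caller or at the top of this method.

def process_weights_after_loading(layer):
    if getattr(layer, "_already_called_process_weights_after_loading", False):
        return  # weights were already converted; a reload must not re-run this
    ...  # one-time weight conversion happens here
    layer._already_called_process_weights_after_loading = True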