[Quantization][Deprecation] Remove DeepSpeedFp8 (#32679)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Robert Shaw
2026-01-21 09:32:12 -05:00
committed by GitHub
parent 42135d6898
commit cea3c754c4
5 changed files with 19 additions and 284 deletions

View File

@@ -9,14 +9,12 @@ Example usage:
 python save_sharded_state.py \
     --model /path/to/load \
-    --quantization deepspeedfp \
     --tensor-parallel-size 8 \
     --output /path/to/save/sharded/model
 python load_sharded_state.py \
     --model /path/to/saved/sharded/model \
     --load-format sharded_state \
-    --quantization deepspeedfp \
     --tensor-parallel-size 8 \
     --prompt "Hello, my name is" \
     --max-tokens 50
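With deepspeedfp removed, the sharded-state flow needs no quantization flag. A hedged Python-API sketch equivalent to the load command above (paths and parallel size are the placeholders from the example):

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="/path/to/saved/sharded/model",
        load_format="sharded_state",
        tensor_parallel_size=8,
    )
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=50))
    print(outputs[0].outputs[0].text)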

View File

@@ -9,7 +9,6 @@ Example usage:
 python save_sharded_state.py \
     --model /path/to/load \
-    --quantization deepspeedfp \
     --tensor-parallel-size 8 \
     --output /path/to/save
@@ -18,7 +17,6 @@ Then, the model can be loaded with
 llm = LLM(
     model="/path/to/save",
     load_format="sharded_state",
-    quantization="deepspeedfp",
     tensor_parallel_size=8,
 )
"""

View File

@@ -11,7 +11,6 @@ logger = init_logger(__name__)
 QuantizationMethods = Literal[
     "awq",
-    "deepspeedfp",
     "fp8",
     "ptpc_fp8",
     "fbgemm_fp8",
@@ -42,7 +41,6 @@ QuantizationMethods = Literal[
 QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
 DEPRECATED_QUANTIZATION_METHODS = [
-    "deepspeedfp",
     "tpu_int8",
     "ptpc_fp8",
     "fbgemm_fp8",
@@ -126,7 +124,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         CompressedTensorsConfig,
     )
     from .cpu_wna16 import CPUAWQConfig
-    from .deepspeedfp import DeepSpeedFPConfig
     from .experts_int8 import ExpertsInt8Config
     from .fbgemm_fp8 import FBGEMMFp8Config
     from .fp8 import Fp8Config
@@ -149,7 +146,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     method_to_config: dict[str, type[QuantizationConfig]] = {
         "awq": AWQConfig,
-        "deepspeedfp": DeepSpeedFPConfig,
         "fp8": Fp8Config,
         "fbgemm_fp8": FBGEMMFp8Config,
         "fp_quant": FPQuantConfig,

View File

@@ -1,218 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from packaging import version

from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization import (
    QuantizationConfig,
    QuantizationMethods,
)
from vllm.model_executor.utils import set_weight_attrs


class DeepSpeedFPConfig(QuantizationConfig):
    """Config for DeepSpeed FP quantizer. It supports fp6 and fp8.

    Args:
        weight_bits: the target quantization bits, 6 or 8.
        group_size: group size for quantization; defaults to 512.
    """

    def __init__(
        self,
        weight_bits: int = 8,
        group_size: int = 512,
    ) -> None:
        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.valid_types = [torch.bfloat16, torch.float16]
        if self.weight_bits not in (6, 8):
            raise ValueError(
                "Currently, only 6-bit or 8-bit weight quantization is "
                f"supported for DeepSpeed FP quantization, but got "
                f"{self.weight_bits} bits."
            )

    def __repr__(self) -> str:
        return (
            f"DeepSpeedFPConfig(weight_bits={self.weight_bits}, "
            f"group_size={self.group_size})"
        )

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        return "deepspeedfp"

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "DeepSpeedFPConfig":
        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        return cls(weight_bits=weight_bits, group_size=group_size)
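    # Illustrative checkpoint config accepted by the parser above; the key
    # names come from the get_from_keys lookups, the values are examples:
    #   {"bits": 8, "group_size": 512} -> DeepSpeedFPConfig(8, 512)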

    def get_linear_method(self) -> "DeepSpeedFPLinearMethod":
        return DeepSpeedFPLinearMethod(self)

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.half, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        # TODO: verify the actual minimum; 60 maps to compute capability 6.0.
        return 60

    @staticmethod
    def get_config_filenames() -> list[str]:
        return [
            "quant_config.json",
            "quantize_config.json",
        ]

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["DeepSpeedFPLinearMethod"]:
        if isinstance(layer, LinearBase):
            return DeepSpeedFPLinearMethod(self)
        return None


class DeepSpeedFPLinearMethod(LinearMethodBase):
    """Linear method for DeepSpeedFP quantizer.

    Args:
        quant_config: the DeepSpeedFP quantization config.
    """

    def __init__(self, quant_config: DeepSpeedFPConfig):
        self.quant_config = quant_config
        self.weight = None

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        weight_loader=None,
        **extra_weight_attrs,
    ):
        del output_size
        del input_size
        output_size_per_partition = sum(output_partition_sizes)
        weight = DeepSpeedFPParameter(
            torch.Size((output_size_per_partition, input_size_per_partition)),
            params_dtype=params_dtype,
            quant_config=self.quant_config,
        )
        set_weight_attrs(
            weight,
            {
                "input_dim": 1,
                "output_dim": 0,
            },
        )
        layer.register_parameter("weight", weight)

        def quant_weight_loader(param, loaded_weight, *args, **kwargs):
            # Calls the original weight loader (if any), quantizes the result,
            # and then loads the quantized parameter.
            if weight_loader is not None:
                orig_param_data = param.data
                param.data = param.ds_dequantize()
                weight_loader(param, loaded_weight, *args, **kwargs)
                param.data, loaded_weight = orig_param_data, param.data
            param.ds_quantize_(loaded_weight.cuda())

        extra_weight_attrs["weight_loader"] = quant_weight_loader
        set_weight_attrs(weight, extra_weight_attrs)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Dequantize the full weight on the fly, then run a standard linear.
        weight = layer.weight
        y = weight.ds_dequantize()
        return F.linear(x, y, bias)


class DeepSpeedFPParameter(nn.Parameter):
    """
    DeepSpeedFP quantized parameter class that implements fp8/fp6
    quantization via DeepSpeed. Weights are stored in quantized form on
    GPUs, and can be dequantized on-the-fly when needed by the model.
    """

    def __new__(
        cls,
        orig_shape: torch.Size,
        params_dtype: torch.dtype,
        quant_config: DeepSpeedFPConfig,
    ):
        try:
            import deepspeed

            if version.parse(deepspeed.__version__) < version.parse("0.14.2"):
                raise ImportError(
                    "deepspeed version is too old. "
                    "Please install deepspeed>=0.14.2."
                )
            from deepspeed.ops.fp_quantizer import FP_Quantize
        except ImportError as err:
            raise ImportError(
                "Please install deepspeed>=0.14.2 via "
                "`pip install deepspeed>=0.14.2` to use "
                "deepspeedfp quantizer."
            ) from err

        data = torch.empty(
            (
                orig_shape.numel() // quant_config.group_size,
                quant_config.group_size * quant_config.weight_bits // 8 + 4,
            ),
            dtype=torch.int8,
        )
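        # Worked example (illustrative; assumes the extra 4 bytes per group
        # hold scale metadata): a 4096x4096 fp8 weight with group_size=512
        # packs into 4096*4096/512 = 32768 groups of 512*8/8 + 4 = 516 int8
        # bytes each, i.e. ~16.1 MiB of quantized storage.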
        self = torch.Tensor._make_subclass(cls, data, data.requires_grad)
        self.orig_shape = orig_shape
        self.quant_config = quant_config
        self.fp_quantizer = FP_Quantize(group_size=quant_config.group_size)
        self.fp_quantizer.orig_shape = orig_shape
        self.fp_quantizer.orig_dtype = params_dtype
        return self

    def ds_quantize_(self, tensor: torch.Tensor):
        assert tensor.device.type == "cuda" and tensor.dtype != torch.int8
        return self.data.copy_(
            self.fp_quantizer.quantize(
                tensor.data,
                q_bits=self.quant_config.weight_bits,
            )
        )

    def ds_dequantize(self, fp_out=None) -> torch.Tensor:
        """
        Return a tensor containing the dequantized weights of this parameter.
        """
        assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
        return self.fp_quantizer.dequantize(
            self.data, fp_out=fp_out, q_bits=self.quant_config.weight_bits
        )

    def ds_selective_dequantize(self, indices, fp_out=None) -> torch.Tensor:
        """
        Return a tensor where only the weights at `indices` are dequantized
        (to save HBM -> SRAM bandwidth).
        """
        assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
        return self.fp_quantizer.selective_dequantize(
            self.data, indices, fp_out=fp_out, q_bits=self.quant_config.weight_bits
        )
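A hedged usage sketch of the round trip above (assumes a CUDA device and deepspeed>=0.14.2 installed; the shape and dtype are arbitrary examples):

    cfg = DeepSpeedFPConfig(weight_bits=8, group_size=512)
    param = DeepSpeedFPParameter(
        torch.Size((4096, 4096)), params_dtype=torch.bfloat16, quant_config=cfg
    )
    param.ds_quantize_(torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda"))
    w = param.ds_dequantize()  # bf16 tensor, shape (4096, 4096)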

View File

@@ -29,10 +29,6 @@ from vllm.model_executor.layers.linear import (
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.deepspeedfp import (
-    DeepSpeedFPConfig,
-    DeepSpeedFPParameter,
-)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -128,7 +124,6 @@ class ArcticMoE(nn.Module):
         self.intermediate_size = config.intermediate_size // self.tp_size
         self.is_moe_layer = (layer_id + 1) % config.moe_layer_frequency == 0
-        self.is_quant = isinstance(quant_config, DeepSpeedFPConfig)
         self.reduce_results = reduce_results
         # Some other parameters
         if params_dtype is None:
@@ -151,40 +146,24 @@ class ArcticMoE(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.gate",
         )
-        if self.is_quant:
-            self.ws = DeepSpeedFPParameter(
-                torch.Size(
-                    (self.num_experts, 2 * self.intermediate_size, self.hidden_size)
-                ),
-                params_dtype=params_dtype,
-                quant_config=quant_config,
-            )
+        self.ws = nn.Parameter(
+            torch.empty(
+                self.num_experts,
+                2 * self.intermediate_size,
+                self.hidden_size,
+                device=current_platform.device_type,
+                dtype=self.params_dtype,
+            )
+        )
-            self.w2s = DeepSpeedFPParameter(
-                torch.Size(
-                    (self.num_experts, self.hidden_size, self.intermediate_size)
-                ),
-                params_dtype=params_dtype,
-                quant_config=quant_config,
-            )
-        else:
-            self.ws = nn.Parameter(
-                torch.empty(
-                    self.num_experts,
-                    2 * self.intermediate_size,
-                    self.hidden_size,
-                    device=current_platform.device_type,
-                    dtype=self.params_dtype,
-                )
-            )
-            self.w2s = nn.Parameter(
-                torch.empty(
-                    self.num_experts,
-                    self.hidden_size,
-                    self.intermediate_size,
-                    device=current_platform.device_type,
-                    dtype=self.params_dtype,
-                )
-            )
+        self.w2s = nn.Parameter(
+            torch.empty(
+                self.num_experts,
+                self.hidden_size,
+                self.intermediate_size,
+                device=current_platform.device_type,
+                dtype=self.params_dtype,
+            )
+        )
         set_weight_attrs(
             self.ws,
             {
@@ -206,7 +185,7 @@ class ArcticMoE(nn.Module):
         expert_id: int,
     ):
         tp_rank = get_tensor_model_parallel_rank()
-        param_data = param.ds_dequantize() if self.is_quant else param.data
+        param_data = param.data
         shard_size = self.intermediate_size
         shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
         if weight_name.endswith("w1.weight"):
@@ -217,8 +196,6 @@ class ArcticMoE(nn.Module):
             ]
         if weight_name.endswith("w2.weight"):
             param_data[expert_id, :, :] = loaded_weight[:, shard]
-        if self.is_quant:
-            param.ds_quantize_(param_data)

     def local_moe_fused(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_size = hidden_states.shape
@@ -229,26 +206,10 @@ class ArcticMoE(nn.Module):
         topk_weights, topk_ids, token_expert_indices = fused_topk(
             hidden_states, router_logits, self.top_k, renormalize=do_normalize
         )
-        # topk_ids: (num_tokens, k)
-        if self.is_quant:
-            if 2 * num_tokens <= self.num_experts:
-                # If much fewer tokens than experts, use selective dequantize.
-                ws_dequantized = self.ws.ds_selective_dequantize(topk_ids.flatten())
-                w2s_dequantized = self.w2s.ds_selective_dequantize(topk_ids.flatten())
-                # We gathered the experts to the tokens so update the mapping.
-                topk_ids = torch.arange(
-                    0,
-                    topk_ids.numel(),
-                    device=topk_ids.device,
-                ).reshape(topk_ids.shape)
-            else:
-                ws_dequantized = self.ws.ds_dequantize()
-                w2s_dequantized = self.w2s.ds_dequantize()
         final_hidden_states = fused_experts(
             hidden_states,
-            ws_dequantized if self.is_quant else self.ws,
-            w2s_dequantized if self.is_quant else self.w2s,
+            self.ws,
+            self.w2s,
             topk_weights,
             topk_ids,
             inplace=True,
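For reference, a small standalone sketch of the removed remapping trick: once selective dequantization has gathered one weight copy per (token, slot) pair, the expert ids are rewritten to 0..numel()-1 so that fused_experts indexes the gathered copies rather than the original expert pool (values are illustrative):

    import torch

    # Two tokens routed to their top-2 experts out of a large pool.
    topk_ids = torch.tensor([[5, 9], [31, 5]])
    # After gathering 4 expert-weight copies, point each (token, slot)
    # at its own copy instead of the original expert index.
    remapped = torch.arange(topk_ids.numel(), device=topk_ids.device).reshape(
        topk_ids.shape
    )
    print(remapped)  # tensor([[0, 1], [2, 3]])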