[Quantization][Deprecation] Remove DeepSpeedFp8 (#32679)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Robert Shaw
2026-01-21 09:32:12 -05:00
committed by GitHub
parent 42135d6898
commit cea3c754c4
5 changed files with 19 additions and 284 deletions

View File

@@ -9,14 +9,12 @@ Example usage:
 python save_sharded_state.py \
     --model /path/to/load \
-    --quantization deepspeedfp \
     --tensor-parallel-size 8 \
     --output /path/to/save/sharded/model
 python load_sharded_state.py \
     --model /path/to/saved/sharded/model \
     --load-format sharded_state \
-    --quantization deepspeedfp \
     --tensor-parallel-size 8 \
     --prompt "Hello, my name is" \
     --max-tokens 50
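With deepspeedfp removed, the sharded-state flow needs no quantization flag. A hedged Python-API sketch equivalent to the load command above (paths and parallel size are the placeholders from the example):

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="/path/to/saved/sharded/model",
        load_format="sharded_state",
        tensor_parallel_size=8,
    )
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=50))
    print(outputs[0].outputs[0].text)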

View File

@@ -9,7 +9,6 @@ Example usage:
 python save_sharded_state.py \
     --model /path/to/load \
-    --quantization deepspeedfp \
     --tensor-parallel-size 8 \
     --output /path/to/save
@@ -18,7 +17,6 @@ Then, the model can be loaded with
 llm = LLM(
     model="/path/to/save",
     load_format="sharded_state",
-    quantization="deepspeedfp",
     tensor_parallel_size=8,
 )
"""

View File

@@ -11,7 +11,6 @@ logger = init_logger(__name__)
 QuantizationMethods = Literal[
     "awq",
-    "deepspeedfp",
     "fp8",
     "ptpc_fp8",
     "fbgemm_fp8",
@@ -42,7 +41,6 @@ QuantizationMethods = Literal[
 QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
 DEPRECATED_QUANTIZATION_METHODS = [
-    "deepspeedfp",
     "tpu_int8",
     "ptpc_fp8",
     "fbgemm_fp8",
@@ -126,7 +124,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         CompressedTensorsConfig,
     )
     from .cpu_wna16 import CPUAWQConfig
-    from .deepspeedfp import DeepSpeedFPConfig
     from .experts_int8 import ExpertsInt8Config
     from .fbgemm_fp8 import FBGEMMFp8Config
     from .fp8 import Fp8Config
@@ -149,7 +146,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     method_to_config: dict[str, type[QuantizationConfig]] = {
         "awq": AWQConfig,
-        "deepspeedfp": DeepSpeedFPConfig,
         "fp8": Fp8Config,
         "fbgemm_fp8": FBGEMMFp8Config,
         "fp_quant": FPQuantConfig,

View File

@@ -1,218 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from packaging import version

from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization import (
    QuantizationConfig,
    QuantizationMethods,
)
from vllm.model_executor.utils import set_weight_attrs


class DeepSpeedFPConfig(QuantizationConfig):
    """Config for DeepSpeed FP quantizer. It supports fp6 and fp8.

    Args:
        weight_bits: the target quantization bits, 6 or 8.
        group_size: group size for quantization; defaults to 512.
    """

    def __init__(
        self,
        weight_bits: int = 8,
        group_size: int = 512,
    ) -> None:
        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.valid_types = [torch.bfloat16, torch.float16]
        if self.weight_bits not in (6, 8):
            raise ValueError(
                "Currently, only 6-bit or 8-bit weight quantization is "
                f"supported for DeepSpeed FP quantization, but got "
                f"{self.weight_bits} bits."
            )

    def __repr__(self) -> str:
        return (
            f"DeepSpeedFPConfig(weight_bits={self.weight_bits}, "
            f"group_size={self.group_size})"
        )

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        return "deepspeedfp"

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "DeepSpeedFPConfig":
        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        return cls(weight_bits=weight_bits, group_size=group_size)
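    # Illustrative checkpoint config accepted by the parser above; the key
    # names come from the get_from_keys lookups, the values are examples:
    #   {"bits": 8, "group_size": 512} -> DeepSpeedFPConfig(8, 512)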

    def get_linear_method(self) -> "DeepSpeedFPLinearMethod":
        return DeepSpeedFPLinearMethod(self)

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.half, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        # TODO: verify the actual minimum; 60 maps to compute capability 6.0.
        return 60

    @staticmethod
    def get_config_filenames() -> list[str]:
        return [
            "quant_config.json",
            "quantize_config.json",
        ]

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["DeepSpeedFPLinearMethod"]:
        if isinstance(layer, LinearBase):
            return DeepSpeedFPLinearMethod(self)
        return None


class DeepSpeedFPLinearMethod(LinearMethodBase):
    """Linear method for DeepSpeedFP quantizer.

    Args:
        quant_config: the DeepSpeedFP quantization config.
    """

    def __init__(self, quant_config: DeepSpeedFPConfig):
        self.quant_config = quant_config
        self.weight = None

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        weight_loader=None,
        **extra_weight_attrs,
    ):
        del output_size
        del input_size
        output_size_per_partition = sum(output_partition_sizes)
        weight = DeepSpeedFPParameter(
            torch.Size((output_size_per_partition, input_size_per_partition)),
            params_dtype=params_dtype,
            quant_config=self.quant_config,
        )
        set_weight_attrs(
            weight,
            {
                "input_dim": 1,
                "output_dim": 0,
            },
        )
        layer.register_parameter("weight", weight)

        def quant_weight_loader(param, loaded_weight, *args, **kwargs):
            # Calls the original weight loader (if any), quantizes the result,
            # and then loads the quantized parameter.
            if weight_loader is not None:
                orig_param_data = param.data
                param.data = param.ds_dequantize()
                weight_loader(param, loaded_weight, *args, **kwargs)
                param.data, loaded_weight = orig_param_data, param.data
            param.ds_quantize_(loaded_weight.cuda())

        extra_weight_attrs["weight_loader"] = quant_weight_loader
        set_weight_attrs(weight, extra_weight_attrs)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Dequantize the full weight on the fly, then run a standard linear.
        weight = layer.weight
        y = weight.ds_dequantize()
        return F.linear(x, y, bias)


class DeepSpeedFPParameter(nn.Parameter):
    """
    DeepSpeedFP quantized parameter class that implements fp8/fp6
    quantization via DeepSpeed. Weights are stored in quantized form on
    GPUs, and can be dequantized on-the-fly when needed by the model.
    """

    def __new__(
        cls,
        orig_shape: torch.Size,
        params_dtype: torch.dtype,
        quant_config: DeepSpeedFPConfig,
    ):
        try:
            import deepspeed

            if version.parse(deepspeed.__version__) < version.parse("0.14.2"):
                raise ImportError(
                    "deepspeed version is too old. "
                    "Please install deepspeed>=0.14.2."
                )
            from deepspeed.ops.fp_quantizer import FP_Quantize
        except ImportError as err:
            raise ImportError(
                "Please install deepspeed>=0.14.2 via "
                "`pip install deepspeed>=0.14.2` to use "
                "deepspeedfp quantizer."
            ) from err

        data = torch.empty(
            (
                orig_shape.numel() // quant_config.group_size,
                quant_config.group_size * quant_config.weight_bits // 8 + 4,
            ),
            dtype=torch.int8,
        )
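        # Worked example (illustrative; assumes the extra 4 bytes per group
        # hold scale metadata): a 4096x4096 fp8 weight with group_size=512
        # packs into 4096*4096/512 = 32768 groups of 512*8/8 + 4 = 516 int8
        # bytes each, i.e. ~16.1 MiB of quantized storage.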
        self = torch.Tensor._make_subclass(cls, data, data.requires_grad)
        self.orig_shape = orig_shape
        self.quant_config = quant_config
        self.fp_quantizer = FP_Quantize(group_size=quant_config.group_size)
        self.fp_quantizer.orig_shape = orig_shape
        self.fp_quantizer.orig_dtype = params_dtype
        return self

    def ds_quantize_(self, tensor: torch.Tensor):
        assert tensor.device.type == "cuda" and tensor.dtype != torch.int8
        return self.data.copy_(
            self.fp_quantizer.quantize(
                tensor.data,
                q_bits=self.quant_config.weight_bits,
            )
        )

    def ds_dequantize(self, fp_out=None) -> torch.Tensor:
        """
        Return a tensor containing the dequantized weights of this parameter.
        """
        assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
        return self.fp_quantizer.dequantize(
            self.data, fp_out=fp_out, q_bits=self.quant_config.weight_bits
        )

    def ds_selective_dequantize(self, indices, fp_out=None) -> torch.Tensor:
        """
        Return a tensor where only the weights at `indices` are dequantized
        (to save HBM -> SRAM bandwidth).
        """
        assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
        return self.fp_quantizer.selective_dequantize(
            self.data, indices, fp_out=fp_out, q_bits=self.quant_config.weight_bits
        )
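A hedged usage sketch of the round trip above (assumes a CUDA device and deepspeed>=0.14.2 installed; the shape and dtype are arbitrary examples):

    cfg = DeepSpeedFPConfig(weight_bits=8, group_size=512)
    param = DeepSpeedFPParameter(
        torch.Size((4096, 4096)), params_dtype=torch.bfloat16, quant_config=cfg
    )
    param.ds_quantize_(torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda"))
    w = param.ds_dequantize()  # bf16 tensor, shape (4096, 4096)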

View File

@@ -29,10 +29,6 @@ from vllm.model_executor.layers.linear import (
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.quantization.deepspeedfp import (
-    DeepSpeedFPConfig,
-    DeepSpeedFPParameter,
-)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -128,7 +124,6 @@ class ArcticMoE(nn.Module):
         self.intermediate_size = config.intermediate_size // self.tp_size
         self.is_moe_layer = (layer_id + 1) % config.moe_layer_frequency == 0
-        self.is_quant = isinstance(quant_config, DeepSpeedFPConfig)
         self.reduce_results = reduce_results
         # Some other parameters
         if params_dtype is None:
@@ -151,40 +146,24 @@ class ArcticMoE(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.gate",
         )
-        if self.is_quant:
-            self.ws = DeepSpeedFPParameter(
-                torch.Size(
-                    (self.num_experts, 2 * self.intermediate_size, self.hidden_size)
-                ),
-                params_dtype=params_dtype,
-                quant_config=quant_config,
-            )
+        self.ws = nn.Parameter(
+            torch.empty(
+                self.num_experts,
+                2 * self.intermediate_size,
+                self.hidden_size,
+                device=current_platform.device_type,
+                dtype=self.params_dtype,
+            )
+        )
-            self.w2s = DeepSpeedFPParameter(
-                torch.Size(
-                    (self.num_experts, self.hidden_size, self.intermediate_size)
-                ),
-                params_dtype=params_dtype,
-                quant_config=quant_config,
-            )
-        else:
-            self.ws = nn.Parameter(
-                torch.empty(
-                    self.num_experts,
-                    2 * self.intermediate_size,
-                    self.hidden_size,
-                    device=current_platform.device_type,
-                    dtype=self.params_dtype,
-                )
-            )
-            self.w2s = nn.Parameter(
-                torch.empty(
-                    self.num_experts,
-                    self.hidden_size,
-                    self.intermediate_size,
-                    device=current_platform.device_type,
-                    dtype=self.params_dtype,
-                )
-            )
+        self.w2s = nn.Parameter(
+            torch.empty(
+                self.num_experts,
+                self.hidden_size,
+                self.intermediate_size,
+                device=current_platform.device_type,
+                dtype=self.params_dtype,
+            )
+        )
         set_weight_attrs(
             self.ws,
             {
@@ -206,7 +185,7 @@ class ArcticMoE(nn.Module):
         expert_id: int,
     ):
         tp_rank = get_tensor_model_parallel_rank()
-        param_data = param.ds_dequantize() if self.is_quant else param.data
+        param_data = param.data
         shard_size = self.intermediate_size
         shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
         if weight_name.endswith("w1.weight"):
@@ -217,8 +196,6 @@ class ArcticMoE(nn.Module):
             ]
         if weight_name.endswith("w2.weight"):
             param_data[expert_id, :, :] = loaded_weight[:, shard]
-        if self.is_quant:
-            param.ds_quantize_(param_data)

     def local_moe_fused(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_size = hidden_states.shape
@@ -229,26 +206,10 @@ class ArcticMoE(nn.Module):
         topk_weights, topk_ids, token_expert_indices = fused_topk(
             hidden_states, router_logits, self.top_k, renormalize=do_normalize
         )
-        # topk_ids: (num_tokens, k)
-        if self.is_quant:
-            if 2 * num_tokens <= self.num_experts:
-                # If much fewer tokens than experts, use selective dequantize.
-                ws_dequantized = self.ws.ds_selective_dequantize(topk_ids.flatten())
-                w2s_dequantized = self.w2s.ds_selective_dequantize(topk_ids.flatten())
-                # We gathered the experts to the tokens so update the mapping.
-                topk_ids = torch.arange(
-                    0,
-                    topk_ids.numel(),
-                    device=topk_ids.device,
-                ).reshape(topk_ids.shape)
-            else:
-                ws_dequantized = self.ws.ds_dequantize()
-                w2s_dequantized = self.w2s.ds_dequantize()
         final_hidden_states = fused_experts(
             hidden_states,
-            ws_dequantized if self.is_quant else self.ws,
-            w2s_dequantized if self.is_quant else self.w2s,
+            self.ws,
+            self.w2s,
             topk_weights,
             topk_ids,
             inplace=True,
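For reference, a small standalone sketch of the removed remapping trick: once selective dequantization has gathered one weight copy per (token, slot) pair, the expert ids are rewritten to 0..numel()-1 so that fused_experts indexes the gathered copies rather than the original expert pool (values are illustrative):

    import torch

    # Two tokens routed to their top-2 experts out of a large pool.
    topk_ids = torch.tensor([[5, 9], [31, 5]])
    # After gathering 4 expert-weight copies, point each (token, slot)
    # at its own copy instead of the original expert index.
    remapped = torch.arange(topk_ids.numel(), device=topk_ids.device).reshape(
        topk_ids.shape
    )
    print(remapped)  # tensor([[0, 1], [2, 3]])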