Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions
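
At a glance, the change swaps yapf's bracket-aligned hanging indents for ruff's Black-style layout: each element on its own line with a trailing comma once a call exceeds the line limit. A minimal before/after sketch (some_function and its arguments are made-up names, not from this commit):

# yapf (old): continuation lines aligned to the opening parenthesis
result = some_function(first_argument,
                       second_argument,
                       keyword_argument=True)

# ruff format (new): Black-compatible, one element per line with a
# trailing comma once the call no longer fits on one line
result = some_function(
    first_argument,
    second_argument,
    keyword_argument=True,
)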

diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py

@@ -6,18 +6,25 @@ See https://github.com/vllm-project/vllm/issues/11926 for more details.
 Run `pytest tests/quantization/test_register_quantization_config.py`.
 """
 
 from typing import Any, Optional
 
 import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.model_executor.layers.linear import LinearBase  # noqa: E501
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.linear import (
+    LinearBase,  # noqa: E501
+    UnquantizedLinearMethod,
+)
 from vllm.model_executor.layers.quantization import (
-    QuantizationMethods, get_quantization_config, register_quantization_config)
+    QuantizationMethods,
+    get_quantization_config,
+    register_quantization_config,
+)
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
-    QuantizationConfig)
+    QuantizationConfig,
+)
 
 
 class FakeQuantLinearMethod(UnquantizedLinearMethod):
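
Note also how the two single-line imports from vllm.model_executor.layers.linear above were merged into one parenthesized import: ruff's isort-compatible lint rules (the I series) combine imports from the same module, roughly as in this generic sketch (package.module and the imported names are made up):

# before: same-module imports split across lines by the old isort config
from package.module import first_name
from package.module import second_name

# after: ruff merges them into a single parenthesized import
from package.module import (
    first_name,
    second_name,
)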
@@ -28,10 +35,12 @@ class FakeQuantLinearMethod(UnquantizedLinearMethod):
         super().__init__()
         self.num_bits = num_bits
 
-    def apply(self,
-              layer: "torch.nn.Module",
-              x: "torch.Tensor",
-              bias: Optional["torch.Tensor"] = None) -> "torch.Tensor":
+    def apply(
+        self,
+        layer: "torch.nn.Module",
+        x: "torch.Tensor",
+        bias: Optional["torch.Tensor"] = None,
+    ) -> "torch.Tensor":
         """Perform fake quantization before the linear layer."""
         # Calculate the scales dynamically
@@ -40,8 +49,11 @@ class FakeQuantLinearMethod(UnquantizedLinearMethod):
         scales = (max_val - min_val) / (2**self.num_bits - 1)
         # Fake quantize the input
-        quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1),
-                              2**(self.num_bits - 1) - 1)
+        quant_x = torch.clamp(
+            torch.round(x / scales),
+            -(2 ** (self.num_bits - 1)),
+            2 ** (self.num_bits - 1) - 1,
+        )
         dequant_x = quant_x * scales
         return F.linear(dequant_x, layer.weight, bias)
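
The reformatted clamp is the heart of the fake-quantization path: derive a scale from the observed range, round, clamp to the signed num_bits integer range, then dequantize. A standalone sketch of the same arithmetic for num_bits = 8 (illustration only, not part of this commit):

import torch

num_bits = 8
x = torch.randn(4, 16)

# Dynamic scale from the observed range, as in apply() above
scales = (x.max() - x.min()) / (2**num_bits - 1)

# Round to the nearest step, then clamp to the signed range [-128, 127]
quant_x = torch.clamp(
    torch.round(x / scales),
    -(2 ** (num_bits - 1)),
    2 ** (num_bits - 1) - 1,
)

# Dequantize; the round trip introduces at most ~scales / 2 of error
# for values inside the clamp range
dequant_x = quant_x * scales
print((x - dequant_x).abs().max().item())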
@@ -79,8 +91,9 @@ class CustomQuantConfig(QuantizationConfig):
         """Create a config class from the model's quantization config."""
         return CustomQuantConfig(num_bits=config.get("num_bits", 8))
 
-    def get_quant_method(self, layer: "torch.nn.Module",
-                         prefix: str) -> Optional["FakeQuantLinearMethod"]:
+    def get_quant_method(
+        self, layer: "torch.nn.Module", prefix: str
+    ) -> Optional["FakeQuantLinearMethod"]:
         """Get the quantize method to use for the quantized layer."""
         if isinstance(layer, LinearBase):
             return FakeQuantLinearMethod(num_bits=self.num_bits)
@@ -99,18 +112,20 @@ def test_register_quantization_config():
     register_quantization_config("custom_quant")(CustomQuantConfig)
 
 
-@pytest.mark.parametrize(argnames="model",
-                         argvalues=[
-                             "meta-llama/Llama-3.2-1B-Instruct",
-                         ])
+@pytest.mark.parametrize(
+    argnames="model",
+    argvalues=[
+        "meta-llama/Llama-3.2-1B-Instruct",
+    ],
+)
 def test_custom_quant(vllm_runner, model, monkeypatch):
     """Test infer with the custom quantization method."""
     # `LLM.apply_model` requires pickling a function.
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-    with vllm_runner(model_name=model,
-                     quantization="custom_quant",
-                     enforce_eager=True) as llm:
+    with vllm_runner(
+        model_name=model, quantization="custom_quant", enforce_eager=True
+    ) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
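
For reference, register_quantization_config can also be used as a decorator; the test's direct call register_quantization_config("custom_quant")(CustomQuantConfig) is the unrolled form of the same pattern. A hedged sketch (MyQuantConfig and the "my_custom_quant" name are hypothetical, and it assumes get_quantization_config returns the registered class, as its import in this file suggests):

from vllm.model_executor.layers.quantization import (
    get_quantization_config,
    register_quantization_config,
)

# Decorator form, equivalent to the direct call used in the test
@register_quantization_config("my_custom_quant")
class MyQuantConfig(CustomQuantConfig):  # hypothetical subclass
    pass

# The registry should resolve the new name back to the class
assert get_quantization_config("my_custom_quant") is MyQuantConfig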