"refactor: refactor_repeated_interfaces" (#32486)

Signed-off-by: tom-zju <tanjianpingzju1990@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
tjp_zju
2026-01-18 22:07:01 +08:00
committed by GitHub
parent 38bf2ffb21
commit 2f03035a61
11 changed files with 43 additions and 77 deletions

View File

@@ -12,6 +12,7 @@ from vllm.scalar_type import ScalarType
from vllm.utils.flashinfer import (
flashinfer_quant_nvfp4_8x4_sf_layout,
)
from vllm.utils.math_utils import cdiv
logger = init_logger(__name__)
@@ -3111,10 +3112,6 @@ def matmul_ada_mxf4_bf16_tn(
return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
def ceil_div(a, b):
return (a + b - 1) // b
if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"):
@register_fake("_qutlass_C::fusedQuantizeMxQuest")
@@ -3148,8 +3145,8 @@ def fusedQuantizeMx(
)
rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
n_row_blocks = ceil_div(rows, 128)
n_col_blocks = ceil_div(cols, 4)
n_row_blocks = cdiv(rows, 128)
n_col_blocks = cdiv(cols, 4)
padded_rows = n_row_blocks * 128
padded_cols = n_col_blocks * 4
@@ -3192,8 +3189,8 @@ def fusedQuantizeNv(
)
rows, cols = a.numel() // a.size(-1), a.size(-1) // 16
n_row_blocks = ceil_div(rows, 128)
n_col_blocks = ceil_div(cols, 4)
n_row_blocks = cdiv(rows, 128)
n_col_blocks = cdiv(cols, 4)
padded_rows = n_row_blocks * 128
padded_cols = n_col_blocks * 4
xh_e4m3 = torch.empty(

View File

@@ -32,7 +32,7 @@ def pplx_hidden_dim_scale_bytes(
align = 16
# For blocked per token: set to
# ceil_div(hidden_dim, block_size) * sizeof(float32)
# cdiv(hidden_dim, block_size) * sizeof(float32)
# For per-token: set to 4 * sizeof(float32) (x4 for alignment)
if quant_dtype is not None:
assert isinstance(quant_dtype, torch.dtype)

View File

@@ -248,10 +248,6 @@ class FPQuantLinearMethod(LinearMethodBase):
)
def ceil_div(a, b):
return (a + b - 1) // b
def fused_quantize_mx(
x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, forward_method: str
) -> tuple[torch.Tensor, torch.Tensor]:

View File

@@ -17,6 +17,7 @@ import torch
from torch.library import wrap_triton
from vllm.triton_utils import tl, triton
from vllm.utils.math_utils import cdiv
@triton.jit
@@ -141,10 +142,6 @@ def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
return out
def ceil_div(a, b):
return (a + b - 1) // b
def to_blocked(
input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton"
) -> torch.Tensor:
@@ -160,7 +157,7 @@ def to_blocked(
backend: "torch" (PyTorch path) or "triton" (Triton kernel)
Returns:
Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
Rearranged tensor of shape (32*cdiv(H,128), 16*cdiv(W,4))
"""
if backend == "triton":
return triton_mx_block_rearrange(input_matrix).flatten()
@@ -168,8 +165,8 @@ def to_blocked(
raise ValueError(f'backend must be "torch" or "triton", got {backend!r}')
rows, cols = input_matrix.shape
n_row_blocks = ceil_div(rows, 128)
n_col_blocks = ceil_div(cols, 4)
n_row_blocks = cdiv(rows, 128)
n_col_blocks = cdiv(cols, 4)
# Calculate the padded shape
padded_rows = n_row_blocks * 128

View File

@@ -22,6 +22,7 @@ from vllm.distributed import (
get_tensor_model_parallel_world_size,
)
from vllm.logger import init_logger
from vllm.lora.utils import is_moe_model
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (
LinearBase,
@@ -52,11 +53,6 @@ from vllm.utils.torch_utils import set_default_torch_dtype
logger = init_logger(__name__)
def is_moe_model(model: torch.nn.Module) -> bool:
"""Checks if the model contains FusedMoE layers."""
return bool(any(isinstance(module, FusedMoE) for module in model.modules()))
class BitsAndBytesModelLoader(BaseModelLoader):
"""Model loader to load model weights with BitsAndBytes quantization."""

View File

@@ -40,6 +40,7 @@ from .siglip import SiglipVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
@@ -300,21 +301,15 @@ def _get_num_hidden_layers(hf_config: AyaVisionConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers)
return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported"
)
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
@MULTIMODAL_REGISTRY.register_processor(
AyaVisionMultiModalProcessor,
info=AyaVisionProcessingInfo,

View File

@@ -63,6 +63,7 @@ from .siglip import SiglipVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
@@ -442,29 +443,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers)
return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported"
)
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""Given a signed vision feature layer, get the number of hidden layers
needed to leverage it.
Args:
feature_layer_index: Index of a required layer in the visual encoder.
num_hidden_layers: The total number of hidden layers in the visual
encoder.
"""
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
def init_vision_tower_for_llava(
hf_config: LlavaLikeConfig,
quant_config: QuantizationConfig | None,

View File

@@ -52,6 +52,7 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
@@ -369,29 +370,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers)
return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported"
)
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""Given a signed vision feature layer, get the number of hidden layers
needed to leverage it.
Args:
feature_layer_index: Index of a required layer in the visual encoder.
num_hidden_layers: The total number of hidden layers in the visual
encoder.
"""
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
def init_vision_tower_for_llava(
hf_config: LlavaLikeConfig,
quant_config: QuantizationConfig | None,

View File

@@ -47,7 +47,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .clip import CLIPVisionModel
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .siglip import SiglipVisionModel
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
from .utils import (
AutoWeightsLoader,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
from .vision import (
VisionEncoderInfo,
get_num_selected_vision_tokens,
@@ -356,18 +361,13 @@ def init_vision_tower_for_tarsier(
feature_layers = hf_config.vision_feature_layer
base_num_hidden_layers = vision_config.num_hidden_layers
def _get_layer_index(feature_layer_index: int, num_hidden_layers_total: int) -> int:
if feature_layer_index < 0:
return num_hidden_layers_total + feature_layer_index + 1
return feature_layer_index
if isinstance(feature_layers, int):
num_hidden_layers_to_init = _get_layer_index(
num_hidden_layers_to_init = get_layer_index(
feature_layers, base_num_hidden_layers
)
elif isinstance(feature_layers, (list, tuple)):
num_hidden_layers_to_init = max(
_get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
)
else:
raise TypeError(

View File

@@ -830,3 +830,16 @@ def process_eagle_weight(
model.has_own_lm_head = True
if "embed_tokens" in name:
model.has_own_embed_tokens = True
def get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""Given a signed vision feature layer, get the number of hidden layers
needed to leverage it.
Args:
feature_layer_index: Index of a required layer in the visual encoder.
num_hidden_layers: The total number of hidden layers in the visual encoder.
"""
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index

View File

@@ -25,6 +25,7 @@ from vllm.utils.deep_gemm import (
get_mk_alignment_for_contiguous_layout,
m_grouped_fp8_gemm_nt_contiguous,
)
from vllm.utils.math_utils import cdiv
def _generate_optimal_warmup_m_values(
@@ -40,9 +41,6 @@ def _generate_optimal_warmup_m_values(
device: The torch device to get properties from.
"""
def ceil_div(a: int, b: int) -> int:
return (a + b - 1) // b
# DeepGEMM's possible block sizes
block_ms = [64, 128, 256]
block_ns = list(range(16, min(257, n + 1), 16))
@@ -63,7 +61,7 @@ def _generate_optimal_warmup_m_values(
for wave in range(1, 11): # Up to 10 waves
# M where this block config transitions to next wave
target_blocks = wave * num_sms
m = target_blocks * block_m // ceil_div(n, block_n)
m = target_blocks * block_m // cdiv(n, block_n)
if 1 <= m <= max_tokens:
m_values.add(m)