"refactor: refactor_repeated_interfaces" (#32486)

Signed-off-by: tom-zju <tanjianpingzju1990@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
tjp_zju
2026-01-18 22:07:01 +08:00
committed by GitHub
parent 38bf2ffb21
commit 2f03035a61
11 changed files with 43 additions and 77 deletions

View File

@@ -12,6 +12,7 @@ from vllm.scalar_type import ScalarType
from vllm.utils.flashinfer import (
flashinfer_quant_nvfp4_8x4_sf_layout,
)
from vllm.utils.math_utils import cdiv
logger = init_logger(__name__)
@@ -3111,10 +3112,6 @@ def matmul_ada_mxf4_bf16_tn(
return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
def ceil_div(a, b):
return (a + b - 1) // b
if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"):
@register_fake("_qutlass_C::fusedQuantizeMxQuest")
@@ -3148,8 +3145,8 @@ def fusedQuantizeMx(
)
rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
n_row_blocks = ceil_div(rows, 128)
n_col_blocks = ceil_div(cols, 4)
n_row_blocks = cdiv(rows, 128)
n_col_blocks = cdiv(cols, 4)
padded_rows = n_row_blocks * 128
padded_cols = n_col_blocks * 4
@@ -3192,8 +3189,8 @@ def fusedQuantizeNv(
)
rows, cols = a.numel() // a.size(-1), a.size(-1) // 16
n_row_blocks = ceil_div(rows, 128)
n_col_blocks = ceil_div(cols, 4)
n_row_blocks = cdiv(rows, 128)
n_col_blocks = cdiv(cols, 4)
padded_rows = n_row_blocks * 128
padded_cols = n_col_blocks * 4
xh_e4m3 = torch.empty(

View File

@@ -32,7 +32,7 @@ def pplx_hidden_dim_scale_bytes(
align = 16
# For blocked per token: set to
# ceil_div(hidden_dim, block_size) * sizeof(float32)
# cdiv(hidden_dim, block_size) * sizeof(float32)
# For per-token: set to 4 * sizeof(float32) (x4 for alignment)
if quant_dtype is not None:
assert isinstance(quant_dtype, torch.dtype)

View File

@@ -248,10 +248,6 @@ class FPQuantLinearMethod(LinearMethodBase):
)
def ceil_div(a, b):
return (a + b - 1) // b
def fused_quantize_mx(
x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, forward_method: str
) -> tuple[torch.Tensor, torch.Tensor]:

View File

@@ -17,6 +17,7 @@ import torch
from torch.library import wrap_triton
from vllm.triton_utils import tl, triton
from vllm.utils.math_utils import cdiv
@triton.jit
@@ -141,10 +142,6 @@ def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
return out
def ceil_div(a, b):
return (a + b - 1) // b
def to_blocked(
input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton"
) -> torch.Tensor:
@@ -160,7 +157,7 @@ def to_blocked(
backend: "torch" (PyTorch path) or "triton" (Triton kernel)
Returns:
Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
Rearranged tensor of shape (32*cdiv(H,128), 16*cdiv(W,4))
"""
if backend == "triton":
return triton_mx_block_rearrange(input_matrix).flatten()
@@ -168,8 +165,8 @@ def to_blocked(
raise ValueError(f'backend must be "torch" or "triton", got {backend!r}')
rows, cols = input_matrix.shape
n_row_blocks = ceil_div(rows, 128)
n_col_blocks = ceil_div(cols, 4)
n_row_blocks = cdiv(rows, 128)
n_col_blocks = cdiv(cols, 4)
# Calculate the padded shape
padded_rows = n_row_blocks * 128

View File

@@ -22,6 +22,7 @@ from vllm.distributed import (
get_tensor_model_parallel_world_size,
)
from vllm.logger import init_logger
from vllm.lora.utils import is_moe_model
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (
LinearBase,
@@ -52,11 +53,6 @@ from vllm.utils.torch_utils import set_default_torch_dtype
logger = init_logger(__name__)
def is_moe_model(model: torch.nn.Module) -> bool:
"""Checks if the model contains FusedMoE layers."""
return bool(any(isinstance(module, FusedMoE) for module in model.modules()))
class BitsAndBytesModelLoader(BaseModelLoader):
"""Model loader to load model weights with BitsAndBytes quantization."""

View File

@@ -40,6 +40,7 @@ from .siglip import SiglipVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
@@ -300,21 +301,15 @@ def _get_num_hidden_layers(hf_config: AyaVisionConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers)
return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported"
)
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
@MULTIMODAL_REGISTRY.register_processor(
AyaVisionMultiModalProcessor,
info=AyaVisionProcessingInfo,

View File

@@ -63,6 +63,7 @@ from .siglip import SiglipVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
@@ -442,29 +443,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers)
return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported"
)
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""Given a signed vision feature layer, get the number of hidden layers
needed to leverage it.
Args:
feature_layer_index: Index of a required layer in the visual encoder.
num_hidden_layers: The total number of hidden layers in the visual
encoder.
"""
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
def init_vision_tower_for_llava(
hf_config: LlavaLikeConfig,
quant_config: QuantizationConfig | None,

View File

@@ -52,6 +52,7 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
@@ -369,29 +370,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers)
return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported"
)
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""Given a signed vision feature layer, get the number of hidden layers
needed to leverage it.
Args:
feature_layer_index: Index of a required layer in the visual encoder.
num_hidden_layers: The total number of hidden layers in the visual
encoder.
"""
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
def init_vision_tower_for_llava(
hf_config: LlavaLikeConfig,
quant_config: QuantizationConfig | None,

View File

@@ -47,7 +47,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .clip import CLIPVisionModel
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .siglip import SiglipVisionModel
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
from .utils import (
AutoWeightsLoader,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
from .vision import (
VisionEncoderInfo,
get_num_selected_vision_tokens,
@@ -356,18 +361,13 @@ def init_vision_tower_for_tarsier(
feature_layers = hf_config.vision_feature_layer
base_num_hidden_layers = vision_config.num_hidden_layers
def _get_layer_index(feature_layer_index: int, num_hidden_layers_total: int) -> int:
if feature_layer_index < 0:
return num_hidden_layers_total + feature_layer_index + 1
return feature_layer_index
if isinstance(feature_layers, int):
num_hidden_layers_to_init = _get_layer_index(
num_hidden_layers_to_init = get_layer_index(
feature_layers, base_num_hidden_layers
)
elif isinstance(feature_layers, (list, tuple)):
num_hidden_layers_to_init = max(
_get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
)
else:
raise TypeError(

View File

@@ -830,3 +830,16 @@ def process_eagle_weight(
model.has_own_lm_head = True
if "embed_tokens" in name:
model.has_own_embed_tokens = True
def get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""Given a signed vision feature layer, get the number of hidden layers
needed to leverage it.
Args:
feature_layer_index: Index of a required layer in the visual encoder.
num_hidden_layers: The total number of hidden layers in the visual encoder.
"""
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index

View File

@@ -25,6 +25,7 @@ from vllm.utils.deep_gemm import (
get_mk_alignment_for_contiguous_layout,
m_grouped_fp8_gemm_nt_contiguous,
)
from vllm.utils.math_utils import cdiv
def _generate_optimal_warmup_m_values(
@@ -40,9 +41,6 @@ def _generate_optimal_warmup_m_values(
device: The torch device to get properties from.
"""
def ceil_div(a: int, b: int) -> int:
return (a + b - 1) // b
# DeepGEMM's possible block sizes
block_ms = [64, 128, 256]
block_ns = list(range(16, min(257, n + 1), 16))
@@ -63,7 +61,7 @@ def _generate_optimal_warmup_m_values(
for wave in range(1, 11): # Up to 10 waves
# M where this block config transitions to next wave
target_blocks = wave * num_sms
m = target_blocks * block_m // ceil_div(n, block_n)
m = target_blocks * block_m // cdiv(n, block_n)
if 1 <= m <= max_tokens:
m_values.add(m)