diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 267b242d5..5c777221b 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -12,6 +12,7 @@ from vllm.scalar_type import ScalarType
 from vllm.utils.flashinfer import (
     flashinfer_quant_nvfp4_8x4_sf_layout,
 )
+from vllm.utils.math_utils import cdiv
 
 logger = init_logger(__name__)
 
@@ -3111,10 +3112,6 @@ def matmul_ada_mxf4_bf16_tn(
     return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
 
 
-def ceil_div(a, b):
-    return (a + b - 1) // b
-
-
 if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"):
 
     @register_fake("_qutlass_C::fusedQuantizeMxQuest")
@@ -3148,8 +3145,8 @@ def fusedQuantizeMx(
     )
 
     rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
-    n_row_blocks = ceil_div(rows, 128)
-    n_col_blocks = ceil_div(cols, 4)
+    n_row_blocks = cdiv(rows, 128)
+    n_col_blocks = cdiv(cols, 4)
     padded_rows = n_row_blocks * 128
     padded_cols = n_col_blocks * 4
 
@@ -3192,8 +3189,8 @@ def fusedQuantizeNv(
     )
 
     rows, cols = a.numel() // a.size(-1), a.size(-1) // 16
-    n_row_blocks = ceil_div(rows, 128)
-    n_col_blocks = ceil_div(cols, 4)
+    n_row_blocks = cdiv(rows, 128)
+    n_col_blocks = cdiv(cols, 4)
     padded_rows = n_row_blocks * 128
     padded_cols = n_col_blocks * 4
     xh_e4m3 = torch.empty(
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 2766a2c22..8c4da9711 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -32,7 +32,7 @@ def pplx_hidden_dim_scale_bytes(
     align = 16
 
     # For blocked per token: set to
-    # ceil_div(hidden_dim, block_size) * sizeof(float32)
+    # cdiv(hidden_dim, block_size) * sizeof(float32)
     # For per-token: set to 4 * sizeof(float32) (x4 for alignment)
     if quant_dtype is not None:
         assert isinstance(quant_dtype, torch.dtype)
diff --git a/vllm/model_executor/layers/quantization/fp_quant.py b/vllm/model_executor/layers/quantization/fp_quant.py
index 15a253cef..4ed8d57dd 100644
--- a/vllm/model_executor/layers/quantization/fp_quant.py
+++ b/vllm/model_executor/layers/quantization/fp_quant.py
@@ -248,10 +248,6 @@ class FPQuantLinearMethod(LinearMethodBase):
         )
 
 
-def ceil_div(a, b):
-    return (a + b - 1) // b
-
-
 def fused_quantize_mx(
     x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, forward_method: str
 ) -> tuple[torch.Tensor, torch.Tensor]:
diff --git a/vllm/model_executor/layers/quantization/qutlass_utils.py b/vllm/model_executor/layers/quantization/qutlass_utils.py
index 555bb50da..315ecd0c0 100644
--- a/vllm/model_executor/layers/quantization/qutlass_utils.py
+++ b/vllm/model_executor/layers/quantization/qutlass_utils.py
@@ -17,6 +17,7 @@ import torch
 from torch.library import wrap_triton
 
 from vllm.triton_utils import tl, triton
+from vllm.utils.math_utils import cdiv
 
 
 @triton.jit
@@ -141,10 +142,6 @@ def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
     return out
 
 
-def ceil_div(a, b):
-    return (a + b - 1) // b
-
-
 def to_blocked(
     input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton"
 ) -> torch.Tensor:
@@ -160,7 +157,7 @@ def to_blocked(
         backend: "torch" (PyTorch path) or "triton" (Triton kernel)
 
     Returns:
-        Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
+        Rearranged tensor of shape (32*cdiv(H,128), 16*cdiv(W,4))
     """
     if backend == "triton":
         return triton_mx_block_rearrange(input_matrix).flatten()
@@ -168,8 +165,8 @@ def to_blocked(
         raise ValueError(f'backend must be "torch" or "triton", got {backend!r}')
 
     rows, cols = input_matrix.shape
-    n_row_blocks = ceil_div(rows, 128)
-    n_col_blocks = ceil_div(cols, 4)
+    n_row_blocks = cdiv(rows, 128)
+    n_col_blocks = cdiv(cols, 4)
 
     # Calculate the padded shape
     padded_rows = n_row_blocks * 128
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index aa0206450..40b33cdc5 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -22,6 +22,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.logger import init_logger
+from vllm.lora.utils import is_moe_model
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     LinearBase,
@@ -52,11 +53,6 @@ from vllm.utils.torch_utils import set_default_torch_dtype
 
 logger = init_logger(__name__)
 
 
-def is_moe_model(model: torch.nn.Module) -> bool:
-    """Checks if the model contains FusedMoE layers."""
-    return bool(any(isinstance(module, FusedMoE) for module in model.modules()))
-
-
 class BitsAndBytesModelLoader(BaseModelLoader):
     """Model loader to load model weights with BitsAndBytes quantization."""
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 6b203cb4e..be27fc44e 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -40,6 +40,7 @@ from .siglip import SiglipVisionModel
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
+    get_layer_index,
     init_vllm_registered_model,
     maybe_prefix,
 )
@@ -300,21 +301,15 @@ def _get_num_hidden_layers(hf_config: AyaVisionConfig) -> int:
     num_hidden_layers = hf_config.vision_config.num_hidden_layers
     # If we have one feature layer, initialize up to that layer
     if isinstance(feature_layers, int):
-        return _get_layer_index(feature_layers, num_hidden_layers)
+        return get_layer_index(feature_layers, num_hidden_layers)
     # If we have multiple feature layers, initialize up to the deepest m
     elif isinstance(feature_layers, (list, tuple)):
-        return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
+        return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
     raise TypeError(
         f"vision_layer_feature type: {type(feature_layers)} is not supported"
     )
 
 
-def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
-    if feature_layer_index < 0:
-        return num_hidden_layers + feature_layer_index + 1
-    return feature_layer_index
-
-
 @MULTIMODAL_REGISTRY.register_processor(
     AyaVisionMultiModalProcessor,
     info=AyaVisionProcessingInfo,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 44780b4ca..a243ee30a 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -63,6 +63,7 @@ from .siglip import SiglipVisionModel
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
+    get_layer_index,
     init_vllm_registered_model,
     maybe_prefix,
 )
@@ -442,29 +443,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
     num_hidden_layers = hf_config.vision_config.num_hidden_layers
     # If we have one feature layer, initialize up to that layer
     if isinstance(feature_layers, int):
-        return _get_layer_index(feature_layers, num_hidden_layers)
+        return get_layer_index(feature_layers, num_hidden_layers)
     # If we have multiple feature layers, initialize up to the deepest one
     elif isinstance(feature_layers, (list, tuple)):
-        return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
+        return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
     raise TypeError(
         f"vision_layer_feature type: {type(feature_layers)} is not supported"
     )
 
 
-def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
-    """Given a signed vision feature layer, get the number of hidden layers
-    needed to leverage it.
-
-    Args:
-        feature_layer_index: Index of a required layer in the visual encoder.
-        num_hidden_layers: The total number of hidden layers in the visual
-            encoder.
-    """
-    if feature_layer_index < 0:
-        return num_hidden_layers + feature_layer_index + 1
-    return feature_layer_index
-
-
 def init_vision_tower_for_llava(
     hf_config: LlavaLikeConfig,
     quant_config: QuantizationConfig | None,
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 6c4418aa6..47bde285c 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -52,6 +52,7 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
+    get_layer_index,
     init_vllm_registered_model,
     maybe_prefix,
 )
@@ -369,29 +370,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
     num_hidden_layers = hf_config.vision_config.num_hidden_layers
     # If we have one feature layer, initialize up to that layer
     if isinstance(feature_layers, int):
-        return _get_layer_index(feature_layers, num_hidden_layers)
+        return get_layer_index(feature_layers, num_hidden_layers)
     # If we have multiple feature layers, initialize up to the deepest one
     elif isinstance(feature_layers, (list, tuple)):
-        return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
+        return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
     raise TypeError(
         f"vision_layer_feature type: {type(feature_layers)} is not supported"
     )
 
 
-def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
-    """Given a signed vision feature layer, get the number of hidden layers
-    needed to leverage it.
-
-    Args:
-        feature_layer_index: Index of a required layer in the visual encoder.
-        num_hidden_layers: The total number of hidden layers in the visual
-            encoder.
-    """
-    if feature_layer_index < 0:
-        return num_hidden_layers + feature_layer_index + 1
-    return feature_layer_index
-
-
 def init_vision_tower_for_llava(
     hf_config: LlavaLikeConfig,
     quant_config: QuantizationConfig | None,
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 9f9bb7362..935d2575a 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -47,7 +47,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .clip import CLIPVisionModel
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .siglip import SiglipVisionModel
-from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
+from .utils import (
+    AutoWeightsLoader,
+    get_layer_index,
+    init_vllm_registered_model,
+    maybe_prefix,
+)
 from .vision import (
     VisionEncoderInfo,
     get_num_selected_vision_tokens,
@@ -356,18 +361,13 @@ def init_vision_tower_for_tarsier(
     feature_layers = hf_config.vision_feature_layer
     base_num_hidden_layers = vision_config.num_hidden_layers
 
-    def _get_layer_index(feature_layer_index: int, num_hidden_layers_total: int) -> int:
-        if feature_layer_index < 0:
-            return num_hidden_layers_total + feature_layer_index + 1
-        return feature_layer_index
-
     if isinstance(feature_layers, int):
-        num_hidden_layers_to_init = _get_layer_index(
+        num_hidden_layers_to_init = get_layer_index(
             feature_layers, base_num_hidden_layers
         )
     elif isinstance(feature_layers, (list, tuple)):
         num_hidden_layers_to_init = max(
-            _get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
+            get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
         )
     else:
         raise TypeError(
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index f25ab9153..a91d99b4f 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -830,3 +830,16 @@ def process_eagle_weight(
         model.has_own_lm_head = True
     if "embed_tokens" in name:
         model.has_own_embed_tokens = True
+
+
+def get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
+    """Given a signed vision feature layer, get the number of hidden layers
+    needed to leverage it.
+
+    Args:
+        feature_layer_index: Index of a required layer in the visual encoder.
+        num_hidden_layers: The total number of hidden layers in the visual encoder.
+    """
+    if feature_layer_index < 0:
+        return num_hidden_layers + feature_layer_index + 1
+    return feature_layer_index
diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py
index 2bbc655bd..212a725d4 100644
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -25,6 +25,7 @@ from vllm.utils.deep_gemm import (
     get_mk_alignment_for_contiguous_layout,
     m_grouped_fp8_gemm_nt_contiguous,
 )
+from vllm.utils.math_utils import cdiv
 
 
 def _generate_optimal_warmup_m_values(
@@ -40,9 +41,6 @@
         device: The torch device to get properties from.
     """
 
-    def ceil_div(a: int, b: int) -> int:
-        return (a + b - 1) // b
-
     # DeepGEMM's possible block sizes
     block_ms = [64, 128, 256]
     block_ns = list(range(16, min(257, n + 1), 16))
@@ -63,7 +61,7 @@
         for wave in range(1, 11):  # Up to 10 waves
             # M where this block config transitions to next wave
             target_blocks = wave * num_sms
-            m = target_blocks * block_m // ceil_div(n, block_n)
+            m = target_blocks * block_m // cdiv(n, block_n)
             if 1 <= m <= max_tokens:
                 m_values.add(m)