"refactor: refactor_repeated_interfaces" (#32486)

Signed-off-by: tom-zju <tanjianpingzju1990@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
tjp_zju
2026-01-18 22:07:01 +08:00
committed by GitHub
parent 38bf2ffb21
commit 2f03035a61
11 changed files with 43 additions and 77 deletions

View File

@@ -12,6 +12,7 @@ from vllm.scalar_type import ScalarType
from vllm.utils.flashinfer import ( from vllm.utils.flashinfer import (
flashinfer_quant_nvfp4_8x4_sf_layout, flashinfer_quant_nvfp4_8x4_sf_layout,
) )
from vllm.utils.math_utils import cdiv
logger = init_logger(__name__) logger = init_logger(__name__)
@@ -3111,10 +3112,6 @@ def matmul_ada_mxf4_bf16_tn(
return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha) return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
def ceil_div(a, b):
    """Return ``a`` divided by ``b``, rounded up towards positive infinity.

    Implemented as a negated floor division. Unlike ``(a + b - 1) // b``
    this also rounds correctly when ``b`` is negative or when ``a`` is not
    a whole number, while giving the same result for positive integers.

    Args:
        a: Numerator.
        b: Divisor; must be non-zero.

    Returns:
        The ceiling of ``a / b``.
    """
    # floor(a / -b) == -ceil(a / b), so negating the floor gives the ceiling.
    return -(a // -b)
if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"): if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"):
@register_fake("_qutlass_C::fusedQuantizeMxQuest") @register_fake("_qutlass_C::fusedQuantizeMxQuest")
@@ -3148,8 +3145,8 @@ def fusedQuantizeMx(
) )
rows, cols = a.numel() // a.size(-1), a.size(-1) // 32 rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
n_row_blocks = ceil_div(rows, 128) n_row_blocks = cdiv(rows, 128)
n_col_blocks = ceil_div(cols, 4) n_col_blocks = cdiv(cols, 4)
padded_rows = n_row_blocks * 128 padded_rows = n_row_blocks * 128
padded_cols = n_col_blocks * 4 padded_cols = n_col_blocks * 4
@@ -3192,8 +3189,8 @@ def fusedQuantizeNv(
) )
rows, cols = a.numel() // a.size(-1), a.size(-1) // 16 rows, cols = a.numel() // a.size(-1), a.size(-1) // 16
n_row_blocks = ceil_div(rows, 128) n_row_blocks = cdiv(rows, 128)
n_col_blocks = ceil_div(cols, 4) n_col_blocks = cdiv(cols, 4)
padded_rows = n_row_blocks * 128 padded_rows = n_row_blocks * 128
padded_cols = n_col_blocks * 4 padded_cols = n_col_blocks * 4
xh_e4m3 = torch.empty( xh_e4m3 = torch.empty(

View File

@@ -32,7 +32,7 @@ def pplx_hidden_dim_scale_bytes(
align = 16 align = 16
# For blocked per token: set to # For blocked per token: set to
# ceil_div(hidden_dim, block_size) * sizeof(float32) # cdiv(hidden_dim, block_size) * sizeof(float32)
# For per-token: set to 4 * sizeof(float32) (x4 for alignment) # For per-token: set to 4 * sizeof(float32) (x4 for alignment)
if quant_dtype is not None: if quant_dtype is not None:
assert isinstance(quant_dtype, torch.dtype) assert isinstance(quant_dtype, torch.dtype)

View File

@@ -248,10 +248,6 @@ class FPQuantLinearMethod(LinearMethodBase):
) )
def ceil_div(a, b):
    """Compute the ceiling of ``a / b`` using floor division only.

    The ``(a + b - 1) // b`` form is only right for positive integer
    operands; negating a floor division by ``-b`` gives the ceiling for
    negative divisors and fractional numerators as well.

    Args:
        a: Numerator.
        b: Divisor; must be non-zero.

    Returns:
        ``a / b`` rounded up towards positive infinity.
    """
    floored_negation = a // -b  # equals floor(-(a / b))
    return -floored_negation
def fused_quantize_mx( def fused_quantize_mx(
x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, forward_method: str x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, forward_method: str
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:

View File

@@ -17,6 +17,7 @@ import torch
from torch.library import wrap_triton from torch.library import wrap_triton
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
from vllm.utils.math_utils import cdiv
@triton.jit @triton.jit
@@ -141,10 +142,6 @@ def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
return out return out
def ceil_div(a, b):
    """Ceiling division: the smallest integer ``q`` with ``q * b >= a``
    for a positive ``b``.

    Written as ``-(a // -b)`` so the result is also the true ceiling when
    ``b`` is negative, which ``(a + b - 1) // b`` does not guarantee.

    Args:
        a: Numerator.
        b: Divisor; must be non-zero.

    Returns:
        ``a / b`` rounded up towards positive infinity.
    """
    return -(a // -b)
def to_blocked( def to_blocked(
input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton" input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton"
) -> torch.Tensor: ) -> torch.Tensor:
@@ -160,7 +157,7 @@ def to_blocked(
backend: "torch" (PyTorch path) or "triton" (Triton kernel) backend: "torch" (PyTorch path) or "triton" (Triton kernel)
Returns: Returns:
Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4)) Rearranged tensor of shape (32*cdiv(H,128), 16*cdiv(W,4))
""" """
if backend == "triton": if backend == "triton":
return triton_mx_block_rearrange(input_matrix).flatten() return triton_mx_block_rearrange(input_matrix).flatten()
@@ -168,8 +165,8 @@ def to_blocked(
raise ValueError(f'backend must be "torch" or "triton", got {backend!r}') raise ValueError(f'backend must be "torch" or "triton", got {backend!r}')
rows, cols = input_matrix.shape rows, cols = input_matrix.shape
n_row_blocks = ceil_div(rows, 128) n_row_blocks = cdiv(rows, 128)
n_col_blocks = ceil_div(cols, 4) n_col_blocks = cdiv(cols, 4)
# Calculate the padded shape # Calculate the padded shape
padded_rows = n_row_blocks * 128 padded_rows = n_row_blocks * 128

View File

@@ -22,6 +22,7 @@ from vllm.distributed import (
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
) )
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.utils import is_moe_model
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
LinearBase, LinearBase,
@@ -52,11 +53,6 @@ from vllm.utils.torch_utils import set_default_torch_dtype
logger = init_logger(__name__) logger = init_logger(__name__)
def is_moe_model(model: torch.nn.Module) -> bool:
    """Checks if the model contains FusedMoE layers.

    Args:
        model: Module whose ``modules()`` iterator is scanned.

    Returns:
        True if at least one yielded module is a ``FusedMoE`` instance,
        otherwise False.
    """
    # any() already returns a bool, so no extra bool() wrapper is needed.
    return any(isinstance(module, FusedMoE) for module in model.modules())
class BitsAndBytesModelLoader(BaseModelLoader): class BitsAndBytesModelLoader(BaseModelLoader):
"""Model loader to load model weights with BitsAndBytes quantization.""" """Model loader to load model weights with BitsAndBytes quantization."""

View File

@@ -40,6 +40,7 @@ from .siglip import SiglipVisionModel
from .utils import ( from .utils import (
AutoWeightsLoader, AutoWeightsLoader,
WeightsMapper, WeightsMapper,
get_layer_index,
init_vllm_registered_model, init_vllm_registered_model,
maybe_prefix, maybe_prefix,
) )
@@ -300,21 +301,15 @@ def _get_num_hidden_layers(hf_config: AyaVisionConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer # If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int): if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers) return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one # If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)): elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers) return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError( raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported" f"vision_layer_feature type: {type(feature_layers)} is not supported"
) )
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
@MULTIMODAL_REGISTRY.register_processor( @MULTIMODAL_REGISTRY.register_processor(
AyaVisionMultiModalProcessor, AyaVisionMultiModalProcessor,
info=AyaVisionProcessingInfo, info=AyaVisionProcessingInfo,

View File

@@ -63,6 +63,7 @@ from .siglip import SiglipVisionModel
from .utils import ( from .utils import (
AutoWeightsLoader, AutoWeightsLoader,
WeightsMapper, WeightsMapper,
get_layer_index,
init_vllm_registered_model, init_vllm_registered_model,
maybe_prefix, maybe_prefix,
) )
@@ -442,29 +443,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer # If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int): if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers) return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one # If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)): elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers) return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError( raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported" f"vision_layer_feature type: {type(feature_layers)} is not supported"
) )
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""Given a signed vision feature layer, get the number of hidden layers
needed to leverage it.
Args:
feature_layer_index: Index of a required layer in the visual encoder.
num_hidden_layers: The total number of hidden layers in the visual
encoder.
"""
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
def init_vision_tower_for_llava( def init_vision_tower_for_llava(
hf_config: LlavaLikeConfig, hf_config: LlavaLikeConfig,
quant_config: QuantizationConfig | None, quant_config: QuantizationConfig | None,

View File

@@ -52,6 +52,7 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
from .utils import ( from .utils import (
AutoWeightsLoader, AutoWeightsLoader,
WeightsMapper, WeightsMapper,
get_layer_index,
init_vllm_registered_model, init_vllm_registered_model,
maybe_prefix, maybe_prefix,
) )
@@ -369,29 +370,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
num_hidden_layers = hf_config.vision_config.num_hidden_layers num_hidden_layers = hf_config.vision_config.num_hidden_layers
# If we have one feature layer, initialize up to that layer # If we have one feature layer, initialize up to that layer
if isinstance(feature_layers, int): if isinstance(feature_layers, int):
return _get_layer_index(feature_layers, num_hidden_layers) return get_layer_index(feature_layers, num_hidden_layers)
# If we have multiple feature layers, initialize up to the deepest one # If we have multiple feature layers, initialize up to the deepest one
elif isinstance(feature_layers, (list, tuple)): elif isinstance(feature_layers, (list, tuple)):
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers) return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
raise TypeError( raise TypeError(
f"vision_layer_feature type: {type(feature_layers)} is not supported" f"vision_layer_feature type: {type(feature_layers)} is not supported"
) )
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""Given a signed vision feature layer, get the number of hidden layers
needed to leverage it.
Args:
feature_layer_index: Index of a required layer in the visual encoder.
num_hidden_layers: The total number of hidden layers in the visual
encoder.
"""
if feature_layer_index < 0:
return num_hidden_layers + feature_layer_index + 1
return feature_layer_index
def init_vision_tower_for_llava( def init_vision_tower_for_llava(
hf_config: LlavaLikeConfig, hf_config: LlavaLikeConfig,
quant_config: QuantizationConfig | None, quant_config: QuantizationConfig | None,

View File

@@ -47,7 +47,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .clip import CLIPVisionModel from .clip import CLIPVisionModel
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .siglip import SiglipVisionModel from .siglip import SiglipVisionModel
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix from .utils import (
AutoWeightsLoader,
get_layer_index,
init_vllm_registered_model,
maybe_prefix,
)
from .vision import ( from .vision import (
VisionEncoderInfo, VisionEncoderInfo,
get_num_selected_vision_tokens, get_num_selected_vision_tokens,
@@ -356,18 +361,13 @@ def init_vision_tower_for_tarsier(
feature_layers = hf_config.vision_feature_layer feature_layers = hf_config.vision_feature_layer
base_num_hidden_layers = vision_config.num_hidden_layers base_num_hidden_layers = vision_config.num_hidden_layers
def _get_layer_index(feature_layer_index: int, num_hidden_layers_total: int) -> int:
if feature_layer_index < 0:
return num_hidden_layers_total + feature_layer_index + 1
return feature_layer_index
if isinstance(feature_layers, int): if isinstance(feature_layers, int):
num_hidden_layers_to_init = _get_layer_index( num_hidden_layers_to_init = get_layer_index(
feature_layers, base_num_hidden_layers feature_layers, base_num_hidden_layers
) )
elif isinstance(feature_layers, (list, tuple)): elif isinstance(feature_layers, (list, tuple)):
num_hidden_layers_to_init = max( num_hidden_layers_to_init = max(
_get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
) )
else: else:
raise TypeError( raise TypeError(

View File

@@ -830,3 +830,16 @@ def process_eagle_weight(
model.has_own_lm_head = True model.has_own_lm_head = True
if "embed_tokens" in name: if "embed_tokens" in name:
model.has_own_embed_tokens = True model.has_own_embed_tokens = True
def get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
    """Given a signed vision feature layer, get the number of hidden layers
    needed to leverage it.

    Args:
        feature_layer_index: Index of a required layer in the visual encoder.
        num_hidden_layers: The total number of hidden layers in the visual encoder.

    Returns:
        ``feature_layer_index`` when it is non-negative, otherwise
        ``num_hidden_layers + feature_layer_index + 1`` (``-1`` resolves to
        ``num_hidden_layers``).
    """
    if feature_layer_index >= 0:
        return feature_layer_index
    # Negative index: offset from the total layer count.
    return num_hidden_layers + feature_layer_index + 1

View File

@@ -25,6 +25,7 @@ from vllm.utils.deep_gemm import (
get_mk_alignment_for_contiguous_layout, get_mk_alignment_for_contiguous_layout,
m_grouped_fp8_gemm_nt_contiguous, m_grouped_fp8_gemm_nt_contiguous,
) )
from vllm.utils.math_utils import cdiv
def _generate_optimal_warmup_m_values( def _generate_optimal_warmup_m_values(
@@ -40,9 +41,6 @@ def _generate_optimal_warmup_m_values(
device: The torch device to get properties from. device: The torch device to get properties from.
""" """
def ceil_div(a: int, b: int) -> int:
    """Return ``a / b`` rounded up to the nearest integer.

    Uses a negated floor division, which matches ``(a + b - 1) // b`` for
    positive operands and is also correct when ``b`` is negative.

    Args:
        a: Numerator.
        b: Divisor; must be non-zero.

    Returns:
        The ceiling of ``a / b``.
    """
    return -(a // -b)
# DeepGEMM's possible block sizes # DeepGEMM's possible block sizes
block_ms = [64, 128, 256] block_ms = [64, 128, 256]
block_ns = list(range(16, min(257, n + 1), 16)) block_ns = list(range(16, min(257, n + 1), 16))
@@ -63,7 +61,7 @@ def _generate_optimal_warmup_m_values(
for wave in range(1, 11): # Up to 10 waves for wave in range(1, 11): # Up to 10 waves
# M where this block config transitions to next wave # M where this block config transitions to next wave
target_blocks = wave * num_sms target_blocks = wave * num_sms
m = target_blocks * block_m // ceil_div(n, block_n) m = target_blocks * block_m // cdiv(n, block_n)
if 1 <= m <= max_tokens: if 1 <= m <= max_tokens:
m_values.add(m) m_values.add(m)