"refactor: refactor_repeated_interfaces" (#32486)
Signed-off-by: tom-zju <tanjianpingzju1990@gmail.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
@@ -12,6 +12,7 @@ from vllm.scalar_type import ScalarType
|
|||||||
from vllm.utils.flashinfer import (
|
from vllm.utils.flashinfer import (
|
||||||
flashinfer_quant_nvfp4_8x4_sf_layout,
|
flashinfer_quant_nvfp4_8x4_sf_layout,
|
||||||
)
|
)
|
||||||
|
from vllm.utils.math_utils import cdiv
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@@ -3111,10 +3112,6 @@ def matmul_ada_mxf4_bf16_tn(
|
|||||||
return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
|
return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
|
||||||
|
|
||||||
|
|
||||||
def ceil_div(a, b):
|
|
||||||
return (a + b - 1) // b
|
|
||||||
|
|
||||||
|
|
||||||
if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"):
|
if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"):
|
||||||
|
|
||||||
@register_fake("_qutlass_C::fusedQuantizeMxQuest")
|
@register_fake("_qutlass_C::fusedQuantizeMxQuest")
|
||||||
@@ -3148,8 +3145,8 @@ def fusedQuantizeMx(
|
|||||||
)
|
)
|
||||||
|
|
||||||
rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
|
rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
|
||||||
n_row_blocks = ceil_div(rows, 128)
|
n_row_blocks = cdiv(rows, 128)
|
||||||
n_col_blocks = ceil_div(cols, 4)
|
n_col_blocks = cdiv(cols, 4)
|
||||||
padded_rows = n_row_blocks * 128
|
padded_rows = n_row_blocks * 128
|
||||||
padded_cols = n_col_blocks * 4
|
padded_cols = n_col_blocks * 4
|
||||||
|
|
||||||
@@ -3192,8 +3189,8 @@ def fusedQuantizeNv(
|
|||||||
)
|
)
|
||||||
|
|
||||||
rows, cols = a.numel() // a.size(-1), a.size(-1) // 16
|
rows, cols = a.numel() // a.size(-1), a.size(-1) // 16
|
||||||
n_row_blocks = ceil_div(rows, 128)
|
n_row_blocks = cdiv(rows, 128)
|
||||||
n_col_blocks = ceil_div(cols, 4)
|
n_col_blocks = cdiv(cols, 4)
|
||||||
padded_rows = n_row_blocks * 128
|
padded_rows = n_row_blocks * 128
|
||||||
padded_cols = n_col_blocks * 4
|
padded_cols = n_col_blocks * 4
|
||||||
xh_e4m3 = torch.empty(
|
xh_e4m3 = torch.empty(
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ def pplx_hidden_dim_scale_bytes(
|
|||||||
align = 16
|
align = 16
|
||||||
|
|
||||||
# For blocked per token: set to
|
# For blocked per token: set to
|
||||||
# ceil_div(hidden_dim, block_size) * sizeof(float32)
|
# cdiv(hidden_dim, block_size) * sizeof(float32)
|
||||||
# For per-token: set to 4 * sizeof(float32) (x4 for alignment)
|
# For per-token: set to 4 * sizeof(float32) (x4 for alignment)
|
||||||
if quant_dtype is not None:
|
if quant_dtype is not None:
|
||||||
assert isinstance(quant_dtype, torch.dtype)
|
assert isinstance(quant_dtype, torch.dtype)
|
||||||
|
|||||||
@@ -248,10 +248,6 @@ class FPQuantLinearMethod(LinearMethodBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def ceil_div(a, b):
|
|
||||||
return (a + b - 1) // b
|
|
||||||
|
|
||||||
|
|
||||||
def fused_quantize_mx(
|
def fused_quantize_mx(
|
||||||
x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, forward_method: str
|
x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, forward_method: str
|
||||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import torch
|
|||||||
from torch.library import wrap_triton
|
from torch.library import wrap_triton
|
||||||
|
|
||||||
from vllm.triton_utils import tl, triton
|
from vllm.triton_utils import tl, triton
|
||||||
|
from vllm.utils.math_utils import cdiv
|
||||||
|
|
||||||
|
|
||||||
@triton.jit
|
@triton.jit
|
||||||
@@ -141,10 +142,6 @@ def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def ceil_div(a, b):
|
|
||||||
return (a + b - 1) // b
|
|
||||||
|
|
||||||
|
|
||||||
def to_blocked(
|
def to_blocked(
|
||||||
input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton"
|
input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton"
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
@@ -160,7 +157,7 @@ def to_blocked(
|
|||||||
backend: "torch" (PyTorch path) or "triton" (Triton kernel)
|
backend: "torch" (PyTorch path) or "triton" (Triton kernel)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
|
Rearranged tensor of shape (32*cdiv(H,128), 16*cdiv(W,4))
|
||||||
"""
|
"""
|
||||||
if backend == "triton":
|
if backend == "triton":
|
||||||
return triton_mx_block_rearrange(input_matrix).flatten()
|
return triton_mx_block_rearrange(input_matrix).flatten()
|
||||||
@@ -168,8 +165,8 @@ def to_blocked(
|
|||||||
raise ValueError(f'backend must be "torch" or "triton", got {backend!r}')
|
raise ValueError(f'backend must be "torch" or "triton", got {backend!r}')
|
||||||
|
|
||||||
rows, cols = input_matrix.shape
|
rows, cols = input_matrix.shape
|
||||||
n_row_blocks = ceil_div(rows, 128)
|
n_row_blocks = cdiv(rows, 128)
|
||||||
n_col_blocks = ceil_div(cols, 4)
|
n_col_blocks = cdiv(cols, 4)
|
||||||
|
|
||||||
# Calculate the padded shape
|
# Calculate the padded shape
|
||||||
padded_rows = n_row_blocks * 128
|
padded_rows = n_row_blocks * 128
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ from vllm.distributed import (
|
|||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
)
|
)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.lora.utils import is_moe_model
|
||||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||||
from vllm.model_executor.layers.linear import (
|
from vllm.model_executor.layers.linear import (
|
||||||
LinearBase,
|
LinearBase,
|
||||||
@@ -52,11 +53,6 @@ from vllm.utils.torch_utils import set_default_torch_dtype
|
|||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def is_moe_model(model: torch.nn.Module) -> bool:
|
|
||||||
"""Checks if the model contains FusedMoE layers."""
|
|
||||||
return bool(any(isinstance(module, FusedMoE) for module in model.modules()))
|
|
||||||
|
|
||||||
|
|
||||||
class BitsAndBytesModelLoader(BaseModelLoader):
|
class BitsAndBytesModelLoader(BaseModelLoader):
|
||||||
"""Model loader to load model weights with BitsAndBytes quantization."""
|
"""Model loader to load model weights with BitsAndBytes quantization."""
|
||||||
|
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ from .siglip import SiglipVisionModel
|
|||||||
from .utils import (
|
from .utils import (
|
||||||
AutoWeightsLoader,
|
AutoWeightsLoader,
|
||||||
WeightsMapper,
|
WeightsMapper,
|
||||||
|
get_layer_index,
|
||||||
init_vllm_registered_model,
|
init_vllm_registered_model,
|
||||||
maybe_prefix,
|
maybe_prefix,
|
||||||
)
|
)
|
||||||
@@ -300,21 +301,15 @@ def _get_num_hidden_layers(hf_config: AyaVisionConfig) -> int:
|
|||||||
num_hidden_layers = hf_config.vision_config.num_hidden_layers
|
num_hidden_layers = hf_config.vision_config.num_hidden_layers
|
||||||
# If we have one feature layer, initialize up to that layer
|
# If we have one feature layer, initialize up to that layer
|
||||||
if isinstance(feature_layers, int):
|
if isinstance(feature_layers, int):
|
||||||
return _get_layer_index(feature_layers, num_hidden_layers)
|
return get_layer_index(feature_layers, num_hidden_layers)
|
||||||
# If we have multiple feature layers, initialize up to the deepest one
|
# If we have multiple feature layers, initialize up to the deepest one
|
||||||
elif isinstance(feature_layers, (list, tuple)):
|
elif isinstance(feature_layers, (list, tuple)):
|
||||||
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
|
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f"vision_layer_feature type: {type(feature_layers)} is not supported"
|
f"vision_layer_feature type: {type(feature_layers)} is not supported"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
|
|
||||||
if feature_layer_index < 0:
|
|
||||||
return num_hidden_layers + feature_layer_index + 1
|
|
||||||
return feature_layer_index
|
|
||||||
|
|
||||||
|
|
||||||
@MULTIMODAL_REGISTRY.register_processor(
|
@MULTIMODAL_REGISTRY.register_processor(
|
||||||
AyaVisionMultiModalProcessor,
|
AyaVisionMultiModalProcessor,
|
||||||
info=AyaVisionProcessingInfo,
|
info=AyaVisionProcessingInfo,
|
||||||
|
|||||||
@@ -63,6 +63,7 @@ from .siglip import SiglipVisionModel
|
|||||||
from .utils import (
|
from .utils import (
|
||||||
AutoWeightsLoader,
|
AutoWeightsLoader,
|
||||||
WeightsMapper,
|
WeightsMapper,
|
||||||
|
get_layer_index,
|
||||||
init_vllm_registered_model,
|
init_vllm_registered_model,
|
||||||
maybe_prefix,
|
maybe_prefix,
|
||||||
)
|
)
|
||||||
@@ -442,29 +443,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
|
|||||||
num_hidden_layers = hf_config.vision_config.num_hidden_layers
|
num_hidden_layers = hf_config.vision_config.num_hidden_layers
|
||||||
# If we have one feature layer, initialize up to that layer
|
# If we have one feature layer, initialize up to that layer
|
||||||
if isinstance(feature_layers, int):
|
if isinstance(feature_layers, int):
|
||||||
return _get_layer_index(feature_layers, num_hidden_layers)
|
return get_layer_index(feature_layers, num_hidden_layers)
|
||||||
# If we have multiple feature layers, initialize up to the deepest one
|
# If we have multiple feature layers, initialize up to the deepest one
|
||||||
elif isinstance(feature_layers, (list, tuple)):
|
elif isinstance(feature_layers, (list, tuple)):
|
||||||
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
|
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f"vision_layer_feature type: {type(feature_layers)} is not supported"
|
f"vision_layer_feature type: {type(feature_layers)} is not supported"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
|
|
||||||
"""Given a signed vision feature layer, get the number of hidden layers
|
|
||||||
needed to leverage it.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
feature_layer_index: Index of a required layer in the visual encoder.
|
|
||||||
num_hidden_layers: The total number of hidden layers in the visual
|
|
||||||
encoder.
|
|
||||||
"""
|
|
||||||
if feature_layer_index < 0:
|
|
||||||
return num_hidden_layers + feature_layer_index + 1
|
|
||||||
return feature_layer_index
|
|
||||||
|
|
||||||
|
|
||||||
def init_vision_tower_for_llava(
|
def init_vision_tower_for_llava(
|
||||||
hf_config: LlavaLikeConfig,
|
hf_config: LlavaLikeConfig,
|
||||||
quant_config: QuantizationConfig | None,
|
quant_config: QuantizationConfig | None,
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel
|
|||||||
from .utils import (
|
from .utils import (
|
||||||
AutoWeightsLoader,
|
AutoWeightsLoader,
|
||||||
WeightsMapper,
|
WeightsMapper,
|
||||||
|
get_layer_index,
|
||||||
init_vllm_registered_model,
|
init_vllm_registered_model,
|
||||||
maybe_prefix,
|
maybe_prefix,
|
||||||
)
|
)
|
||||||
@@ -369,29 +370,15 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
|
|||||||
num_hidden_layers = hf_config.vision_config.num_hidden_layers
|
num_hidden_layers = hf_config.vision_config.num_hidden_layers
|
||||||
# If we have one feature layer, initialize up to that layer
|
# If we have one feature layer, initialize up to that layer
|
||||||
if isinstance(feature_layers, int):
|
if isinstance(feature_layers, int):
|
||||||
return _get_layer_index(feature_layers, num_hidden_layers)
|
return get_layer_index(feature_layers, num_hidden_layers)
|
||||||
# If we have multiple feature layers, initialize up to the deepest one
|
# If we have multiple feature layers, initialize up to the deepest one
|
||||||
elif isinstance(feature_layers, (list, tuple)):
|
elif isinstance(feature_layers, (list, tuple)):
|
||||||
return max(_get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
|
return max(get_layer_index(idx, num_hidden_layers) for idx in feature_layers)
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f"vision_layer_feature type: {type(feature_layers)} is not supported"
|
f"vision_layer_feature type: {type(feature_layers)} is not supported"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
|
|
||||||
"""Given a signed vision feature layer, get the number of hidden layers
|
|
||||||
needed to leverage it.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
feature_layer_index: Index of a required layer in the visual encoder.
|
|
||||||
num_hidden_layers: The total number of hidden layers in the visual
|
|
||||||
encoder.
|
|
||||||
"""
|
|
||||||
if feature_layer_index < 0:
|
|
||||||
return num_hidden_layers + feature_layer_index + 1
|
|
||||||
return feature_layer_index
|
|
||||||
|
|
||||||
|
|
||||||
def init_vision_tower_for_llava(
|
def init_vision_tower_for_llava(
|
||||||
hf_config: LlavaLikeConfig,
|
hf_config: LlavaLikeConfig,
|
||||||
quant_config: QuantizationConfig | None,
|
quant_config: QuantizationConfig | None,
|
||||||
|
|||||||
@@ -47,7 +47,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
|||||||
from .clip import CLIPVisionModel
|
from .clip import CLIPVisionModel
|
||||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||||
from .siglip import SiglipVisionModel
|
from .siglip import SiglipVisionModel
|
||||||
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
|
from .utils import (
|
||||||
|
AutoWeightsLoader,
|
||||||
|
get_layer_index,
|
||||||
|
init_vllm_registered_model,
|
||||||
|
maybe_prefix,
|
||||||
|
)
|
||||||
from .vision import (
|
from .vision import (
|
||||||
VisionEncoderInfo,
|
VisionEncoderInfo,
|
||||||
get_num_selected_vision_tokens,
|
get_num_selected_vision_tokens,
|
||||||
@@ -356,18 +361,13 @@ def init_vision_tower_for_tarsier(
|
|||||||
feature_layers = hf_config.vision_feature_layer
|
feature_layers = hf_config.vision_feature_layer
|
||||||
base_num_hidden_layers = vision_config.num_hidden_layers
|
base_num_hidden_layers = vision_config.num_hidden_layers
|
||||||
|
|
||||||
def _get_layer_index(feature_layer_index: int, num_hidden_layers_total: int) -> int:
|
|
||||||
if feature_layer_index < 0:
|
|
||||||
return num_hidden_layers_total + feature_layer_index + 1
|
|
||||||
return feature_layer_index
|
|
||||||
|
|
||||||
if isinstance(feature_layers, int):
|
if isinstance(feature_layers, int):
|
||||||
num_hidden_layers_to_init = _get_layer_index(
|
num_hidden_layers_to_init = get_layer_index(
|
||||||
feature_layers, base_num_hidden_layers
|
feature_layers, base_num_hidden_layers
|
||||||
)
|
)
|
||||||
elif isinstance(feature_layers, (list, tuple)):
|
elif isinstance(feature_layers, (list, tuple)):
|
||||||
num_hidden_layers_to_init = max(
|
num_hidden_layers_to_init = max(
|
||||||
_get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
|
get_layer_index(idx, base_num_hidden_layers) for idx in feature_layers
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
|
|||||||
@@ -830,3 +830,16 @@ def process_eagle_weight(
|
|||||||
model.has_own_lm_head = True
|
model.has_own_lm_head = True
|
||||||
if "embed_tokens" in name:
|
if "embed_tokens" in name:
|
||||||
model.has_own_embed_tokens = True
|
model.has_own_embed_tokens = True
|
||||||
|
|
||||||
|
|
||||||
|
def get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
|
||||||
|
"""Given a signed vision feature layer, get the number of hidden layers
|
||||||
|
needed to leverage it.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
feature_layer_index: Index of a required layer in the visual encoder.
|
||||||
|
num_hidden_layers: The total number of hidden layers in the visual encoder.
|
||||||
|
"""
|
||||||
|
if feature_layer_index < 0:
|
||||||
|
return num_hidden_layers + feature_layer_index + 1
|
||||||
|
return feature_layer_index
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ from vllm.utils.deep_gemm import (
|
|||||||
get_mk_alignment_for_contiguous_layout,
|
get_mk_alignment_for_contiguous_layout,
|
||||||
m_grouped_fp8_gemm_nt_contiguous,
|
m_grouped_fp8_gemm_nt_contiguous,
|
||||||
)
|
)
|
||||||
|
from vllm.utils.math_utils import cdiv
|
||||||
|
|
||||||
|
|
||||||
def _generate_optimal_warmup_m_values(
|
def _generate_optimal_warmup_m_values(
|
||||||
@@ -40,9 +41,6 @@ def _generate_optimal_warmup_m_values(
|
|||||||
device: The torch device to get properties from.
|
device: The torch device to get properties from.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def ceil_div(a: int, b: int) -> int:
|
|
||||||
return (a + b - 1) // b
|
|
||||||
|
|
||||||
# DeepGEMM's possible block sizes
|
# DeepGEMM's possible block sizes
|
||||||
block_ms = [64, 128, 256]
|
block_ms = [64, 128, 256]
|
||||||
block_ns = list(range(16, min(257, n + 1), 16))
|
block_ns = list(range(16, min(257, n + 1), 16))
|
||||||
@@ -63,7 +61,7 @@ def _generate_optimal_warmup_m_values(
|
|||||||
for wave in range(1, 11): # Up to 10 waves
|
for wave in range(1, 11): # Up to 10 waves
|
||||||
# M where this block config transitions to next wave
|
# M where this block config transitions to next wave
|
||||||
target_blocks = wave * num_sms
|
target_blocks = wave * num_sms
|
||||||
m = target_blocks * block_m // ceil_div(n, block_n)
|
m = target_blocks * block_m // cdiv(n, block_n)
|
||||||
if 1 <= m <= max_tokens:
|
if 1 <= m <= max_tokens:
|
||||||
m_values.add(m)
|
m_values.add(m)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user