diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py index 1f117a4ee..e64cfd1c5 100644 --- a/vllm/entrypoints/openai/engine/protocol.py +++ b/vllm/entrypoints/openai/engine/protocol.py @@ -7,7 +7,6 @@ import time from typing import Any, ClassVar, Literal, TypeAlias import regex as re -import torch from pydantic import ( BaseModel, ConfigDict, @@ -25,8 +24,6 @@ from vllm.utils.import_utils import resolve_obj_by_qualname logger = init_logger(__name__) -_LONG_INFO = torch.iinfo(torch.long) - class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields diff --git a/vllm/model_executor/layers/quantization/utils/petit_utils.py b/vllm/model_executor/layers/quantization/utils/petit_utils.py index 081f53eac..0df9748b0 100644 --- a/vllm/model_executor/layers/quantization/utils/petit_utils.py +++ b/vllm/model_executor/layers/quantization/utils/petit_utils.py @@ -38,10 +38,6 @@ def _import_petit_kernel() -> "ModuleType": raise ImportError(_PETIT_INSTALL_MSG) from None -# The _require_petit function can now be a simple alias for consistency. -_require_petit = _import_petit_kernel - - def _check_petit_nvfp4_supported( quant_method: str, group_size: int | None ) -> tuple[bool, str | None]: diff --git a/vllm/model_executor/models/glmasr_utils.py b/vllm/model_executor/models/glmasr_utils.py index 492e4b354..80c903da7 100644 --- a/vllm/model_executor/models/glmasr_utils.py +++ b/vllm/model_executor/models/glmasr_utils.py @@ -166,23 +166,3 @@ def _extract_mask_for_item( return feature_attention_mask[start_idx:end_idx] mask_slice = feature_attention_mask[start_idx:end_idx] return _normalize_to_tensor(mask_slice) - - -def _get_num_features_for_item( - feature_attention_mask: torch.Tensor | None, - chunk_counts: torch.Tensor | list[int] | None, - item_idx: int, - audio_embeds: list[torch.Tensor] | None, - merge_factor: int, - conv_params: list[tuple[int, int, int]], -) -> int: - """Get number of features for a specific audio item.""" - if feature_attention_mask is not None: - mask = _extract_mask_for_item(feature_attention_mask, chunk_counts, item_idx) - audio_output_lengths = _get_audio_output_lengths_from_mask( - mask, merge_factor, conv_params - ) - return audio_output_lengths.sum().item() - if audio_embeds is not None: - return audio_embeds[item_idx].shape[0] - raise ValueError("Either feature_attention_mask or audio_embeds must be provided") diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 493fdb465..81f20039b 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -33,8 +33,6 @@ from vllm.model_executor.models.phi4mm_utils import ( unfold_tensor, ) -_AUDIO_PLACEHOLDER_TOKEN_ID = 200011 # <|endoftext11|> - class ConformerEncoderLayer(nn.Module): """ConformerEncoder Layer module. diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 592c6685d..207dbd25e 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -48,7 +48,6 @@ _ROCM_UNSUPPORTED_MODELS: list[str] = [] # Models partially supported by ROCm. # Architecture -> Reason. -_ROCM_SWA_REASON = () _ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = {} _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = { "0x74a0": "AMD_Instinct_MI300A",