diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index a1d4f46aa..ad32abf58 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -284,16 +284,6 @@ def moe_kernel_quantize_input(
     return A, A_scale
 
 
-def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
-    """
-    A permutation routine that works on fp8 types.
-    """
-    if torch.is_floating_point(m) and m.dtype.itemsize == 1:
-        return m.view(dtype=torch.uint8)[idx, ...].view(dtype=m.dtype)
-    else:
-        return m[idx, ...]
-
-
 def normalize_scales_shape(scales: torch.Tensor | None) -> torch.Tensor | None:
     if scales is not None:
         if scales.numel() == 1:
diff --git a/vllm/model_executor/models/glmasr_utils.py b/vllm/model_executor/models/glmasr_utils.py
index ed0551540..8dcfcfa89 100644
--- a/vllm/model_executor/models/glmasr_utils.py
+++ b/vllm/model_executor/models/glmasr_utils.py
@@ -130,39 +130,3 @@ def _group_audio_embeddings(
         grouped_embeddings.append(torch.cat(audio_chunks, dim=0))
         current_idx += count
     return tuple(grouped_embeddings)
-
-
-def _normalize_to_tensor(mask: torch.Tensor | list[torch.Tensor]) -> torch.Tensor:
-    """Convert mask to tensor, handling both list and tensor formats."""
-    if isinstance(mask, list):
-        return (
-            torch.stack(mask)
-            if mask and isinstance(mask[0], torch.Tensor)
-            else torch.tensor(mask)
-        )
-    return mask
-
-
-def _extract_mask_for_item(
-    feature_attention_mask: torch.Tensor | list[torch.Tensor],
-    chunk_counts: torch.Tensor | list[int] | None,
-    item_idx: int,
-) -> torch.Tensor:
-    """Extract attention mask for a specific audio item."""
-    if chunk_counts is None:
-        # Single item per audio
-        mask = feature_attention_mask[item_idx]
-        if isinstance(feature_attention_mask, torch.Tensor):
-            return mask.unsqueeze(0)
-        return _normalize_to_tensor(mask)
-
-    # Multiple chunks per audio: calculate slice indices
-    counts = _as_list_chunk_counts(chunk_counts)
-    start_idx = sum(counts[:item_idx])
-    end_idx = start_idx + counts[item_idx]
-
-    # Extract slice
-    if isinstance(feature_attention_mask, torch.Tensor):
-        return feature_attention_mask[start_idx:end_idx]
-    mask_slice = feature_attention_mask[start_idx:end_idx]
-    return _normalize_to_tensor(mask_slice)