Update pre-commit hooks (#12475)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor, 2025-01-28 00:23:08 +00:00 (committed by GitHub)
parent 6116ca8cd7
commit 823ab79633
64 changed files with 322 additions and 288 deletions

@@ -73,12 +73,12 @@ class MPLinearKernel(ABC):
             torch.nn.Parameter(new_param.data, requires_grad=False))
 
     def _get_weight_params(
-            self, layer: torch.nn.Module
-    ) -> Tuple[torch.Tensor,  # w_q
-               torch.Tensor,  # w_s
-               Optional[torch.Tensor],  # w_zp,
-               Optional[torch.Tensor]  # w_gidx
-               ]:
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # w_q
+                torch.Tensor,  # w_s
+                Optional[torch.Tensor],  # w_zp,
+                Optional[torch.Tensor]  # w_gidx
+            ]:
         return (
             getattr(layer, self.w_q_name),
             getattr(layer, self.w_s_name),

@@ -48,13 +48,13 @@ class ScaledMMLinearKernel(ABC):
         raise NotImplementedError
 
     def _get_weight_params(
-            self, layer: torch.nn.Module
-    ) -> Tuple[torch.Tensor,  # weight
-               torch.Tensor,  # weight_scale
-               Optional[torch.Tensor],  # input_scale,
-               Optional[torch.Tensor],  # input_zp
-               Optional[torch.Tensor],  # azp_adj
-               ]:
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # weight
+                torch.Tensor,  # weight_scale
+                Optional[torch.Tensor],  # input_scale,
+                Optional[torch.Tensor],  # input_zp
+                Optional[torch.Tensor],  # azp_adj
+            ]:
         return (
             getattr(layer, self.w_q_name),
             getattr(layer, self.w_s_name),

@@ -72,9 +72,10 @@ def block_quant_to_tensor_quant(
     x_dq_block = x_q_block.to(torch.float32)
 
     x_dq_block_tiles = [[
-        x_dq_block[j * block_n:min((j + 1) * block_n, n),
-                   i * block_k:min((i + 1) * block_k, k), ]
-        for i in range(k_tiles)
+        x_dq_block[
+            j * block_n:min((j + 1) * block_n, n),
+            i * block_k:min((i + 1) * block_k, k),
+        ] for i in range(k_tiles)
     ] for j in range(n_tiles)]
 
     for i in range(k_tiles):
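Illustrative aside, not part of the diff: the nested comprehension above cuts the dequantized (n, k) tensor into an n_tiles x k_tiles grid of quantization blocks, with edge tiles truncated by the min() bounds. A minimal sketch of the same slicing pattern, using made-up sizes (the real ones come from the quantization config):

    import torch

    # Hypothetical shapes, chosen only so a truncated edge tile is visible.
    n, k, block_n, block_k = 6, 8, 4, 4
    n_tiles = (n + block_n - 1) // block_n  # ceil division -> 2
    k_tiles = (k + block_k - 1) // block_k  # ceil division -> 2

    x_dq_block = torch.arange(n * k, dtype=torch.float32).reshape(n, k)
    tiles = [[
        x_dq_block[
            j * block_n:min((j + 1) * block_n, n),
            i * block_k:min((i + 1) * block_k, k),
        ] for i in range(k_tiles)
    ] for j in range(n_tiles)]

    assert tiles[0][0].shape == (4, 4)  # full interior tile
    assert tiles[1][1].shape == (2, 4)  # edge tile truncated to the remaining 2 rows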

@@ -73,8 +73,8 @@ def requantize_with_max_scale(
     # from disk in this case. Skip requantization in this case (since)
     # we already are quantized with the single scale.
     # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
-    unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo(
-        torch.float8_e4m3fn).min)
+    unfused_module_in_checkpoint = (weight_scale[-1]
+                                    > torch.finfo(torch.float8_e4m3fn).min)
 
     # If unfused checkpoint, need requanize with the single scale.
     if unfused_module_in_checkpoint:
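Illustrative aside, not part of the diff: the comparison only makes sense if the weight scales default to the smallest finite float8_e4m3fn value as a sentinel, so that any scale strictly above it must have been loaded from the checkpoint; that is a reading of the surrounding code, not something shown in this hunk. A toy sketch of the check itself:

    import torch

    fp8_min = torch.finfo(torch.float8_e4m3fn).min  # most negative finite fp8 value

    # Made-up per-shard scales: the last one is above the sentinel, so it was
    # presumably loaded from an unfused checkpoint.
    weight_scale = torch.tensor([fp8_min, fp8_min, 2.0e-2])
    unfused_module_in_checkpoint = (weight_scale[-1]
                                    > torch.finfo(torch.float8_e4m3fn).min)
    assert unfused_module_in_checkpoint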

@@ -716,9 +716,10 @@ def _sample_with_torch(
         tensors required for Pythonization
     '''
 
-    categorized_seq_group_ids: Dict[SamplingType,
-                                    List[int]] = {t: []
-                                                  for t in SamplingType}
+    categorized_seq_group_ids: Dict[SamplingType, List[int]] = {
+        t: []
+        for t in SamplingType
+    }
     categorized_sample_indices = sampling_metadata.categorized_sample_indices
 
     for i, seq_group in enumerate(sampling_metadata.seq_groups):
         sampling_params = seq_group.sampling_params
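Illustrative aside, not part of the diff: the reformatted comprehension still creates an independent empty list for each SamplingType key rather than one shared list, which matters when the buckets are later appended to independently. A small sketch of that behaviour, with a stand-in enum instead of the real vllm.sampling_params.SamplingType:

    from enum import IntEnum
    from typing import Dict, List

    class SamplingType(IntEnum):  # stand-in for the real vLLM enum
        GREEDY = 0
        RANDOM = 1
        RANDOM_SEED = 2

    # One fresh list per key: appending to one bucket leaves the rest empty.
    groups: Dict[SamplingType, List[int]] = {t: [] for t in SamplingType}
    groups[SamplingType.GREEDY].append(0)
    assert groups[SamplingType.RANDOM] == []

    # dict.fromkeys(SamplingType, []) would instead share one list across keys.
    shared = dict.fromkeys(SamplingType, [])
    shared[SamplingType.GREEDY].append(0)
    assert shared[SamplingType.RANDOM] == [0]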

@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:
     def __post_init__(self):
         # sanity checks
-        assert (self.padded_org_vocab_start_index <=
-                self.padded_org_vocab_end_index)
-        assert (self.padded_added_vocab_start_index <=
-                self.padded_added_vocab_end_index)
+        assert (self.padded_org_vocab_start_index
+                <= self.padded_org_vocab_end_index)
+        assert (self.padded_added_vocab_start_index
+                <= self.padded_added_vocab_end_index)
 
         assert self.org_vocab_start_index <= self.org_vocab_end_index
         assert self.added_vocab_start_index <= self.added_vocab_end_index
 
         assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
-        assert (self.added_vocab_start_index <=
-                self.padded_added_vocab_start_index)
+        assert (self.added_vocab_start_index
+                <= self.padded_added_vocab_start_index)
         assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
         assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
         added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
     # torch.compile will fuse all of the pointwise ops below
     # into a single kernel, making it very fast
-    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
-                                                          org_vocab_end_index)
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (
+        input_ < org_vocab_end_index)
     added_vocab_mask = (input_ >= added_vocab_start_index) & (
         input_ < added_vocab_end_index)
     added_offset = added_vocab_start_index - (
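Illustrative aside, not part of the diff: wherever the line break falls, each mask is still one elementwise boolean expression over input_, which is the pointwise pattern the comment says torch.compile can fuse into a single kernel. A toy sketch with made-up vocab boundaries:

    import torch

    # Hypothetical shard boundaries, for illustration only.
    org_vocab_start_index, org_vocab_end_index = 0, 4
    added_vocab_start_index, added_vocab_end_index = 8, 10

    input_ = torch.tensor([1, 3, 5, 8, 9])
    org_vocab_mask = (input_ >= org_vocab_start_index) & (
        input_ < org_vocab_end_index)
    added_vocab_mask = (input_ >= added_vocab_start_index) & (
        input_ < added_vocab_end_index)

    assert org_vocab_mask.tolist() == [True, True, False, False, False]
    assert added_vocab_mask.tolist() == [False, False, False, True, True]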

@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 # from being incorrectly identified as being present in
                 # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
                 shard_pos = quant_param_name.find(shard_name)
-                can_correct_rename = (shard_pos > 0) and (
-                    quant_param_name[shard_pos - 1] == ".")
+                can_correct_rename = (shard_pos
+                                      > 0) and (quant_param_name[shard_pos - 1]
+                                                == ".")
                 # If the quant_param_name is packed, it won't occur in the
                 # param_dict before renaming.
                 new_quant_param_name = quant_param_name.replace(

@@ -298,8 +298,8 @@ class TensorizerAgent:
         to allow for adapter added tokens."""
         for child in self.model.modules():
             if (isinstance(child, VocabParallelEmbedding)
-                    and child.weight.shape[0] <
-                    child.num_embeddings_per_partition):
+                    and child.weight.shape[0]
+                    < child.num_embeddings_per_partition):
                 new_weight = torch.empty(child.num_embeddings_per_partition,
                                          child.embedding_dim,
                                          dtype=child.weight.dtype,

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
-from functools import lru_cache
+from functools import cache
 from typing import Iterable, List, Optional, Set, Tuple, Union
 
 import torch

@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
 logger = init_logger(__name__)
 
 
-@lru_cache(maxsize=None)
+@cache
 def _get_gemma_act_fn(
     hidden_act: Optional[str],
     hidden_activation: Optional[str],
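Illustrative aside, not part of the diff: functools.cache (Python 3.9+) is documented as equivalent to lru_cache(maxsize=None), an unbounded cache, so swapping the decorator does not change behaviour. For example:

    from functools import cache, lru_cache

    calls = {"cache": 0, "lru": 0}

    @cache  # same as @lru_cache(maxsize=None), just shorter
    def square_cached(x: int) -> int:
        calls["cache"] += 1
        return x * x

    @lru_cache(maxsize=None)
    def square_lru(x: int) -> int:
        calls["lru"] += 1
        return x * x

    assert square_cached(3) == square_cached(3) == 9 and calls["cache"] == 1
    assert square_lru(3) == square_lru(3) == 9 and calls["lru"] == 1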

@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 for e in range(p.size(0)):
                     w1_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w1.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w1.weight")
                     w3_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w3.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w3.weight")
                     w1_param, w3_param = p[e].chunk(2, dim=0)
                     assert w1_name not in new_weights
                     assert w3_name not in new_weights

@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 for e in range(p.size(0)):
                     w2_name = n.replace(
                         '.block_sparse_moe.output_linear.weight',
-                        ".block_sparse_moe.experts.%d.w2.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w2.weight")
                     w2_param = p[e]
                     assert w2_name not in new_weights
                     new_weights[w2_name] = w2_param
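Illustrative aside, not part of the diff: the %-formatting and f-string forms build identical expert weight names, so this is a pure style change. For instance:

    e = 3
    old_style = ".block_sparse_moe.experts.%d.w1.weight" % e
    new_style = f".block_sparse_moe.experts.{e}.w1.weight"
    assert old_style == new_style == ".block_sparse_moe.experts.3.w1.weight"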

@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
         # For 1) text-only prefill and decode, 2) image-present decode.
         if image_inputs is None:
             full_text_row_masked_out_mask = (
-                attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to(
-                    input_ids.device)
+                attn_metadata.encoder_seq_lens_tensor
+                != 0).reshape(-1, 1).to(input_ids.device)
             skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
 
         # For image-present prefill.

@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):
         if self.tie_weights:
             assert (
-                self.n_predict >
-                1), "You cannot tie weights between stages when only 1 exists"
+                self.n_predict > 1
+            ), "You cannot tie weights between stages when only 1 exists"
 
             embedding = VocabParallelEmbedding(
                 config.vocab_size,
                 self.inner_dim,

@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
         # compute mask for sparsity
         mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
         factor = scores.abs().clamp(min=mask_logits_threshold)
-        mask_logits_threshold = (
-            (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+        mask_logits_threshold = ((mask_logits_threshold - scores) /
+                                 factor) > (2 * jitter_eps)
 
         # apply mask
         masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))

@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
         mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
                                                            keepdim=True)
         factor = scores.abs().clamp(min=mask_logits_threshold)
-        mask_logits_threshold = (
-            (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+        mask_logits_threshold = ((mask_logits_threshold - scores) /
+                                 factor) > (2 * jitter_eps)
 
         # apply mask
         masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,

@@ -462,7 +462,8 @@ class _ModelRegistry:
 ModelRegistry = _ModelRegistry({
-    model_arch: _LazyRegisteredModel(
+    model_arch:
+    _LazyRegisteredModel(
         module_name=f"vllm.model_executor.models.{mod_relname}",
         class_name=cls_name,
     )

@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
-                                        info=UltravoxProcessingInfo,
-                                        dummy_inputs=UltravoxDummyInputsBuilder
-                                        )
+@MULTIMODAL_REGISTRY.register_processor(
+    UltravoxMultiModalProcessor,
+    info=UltravoxProcessingInfo,
+    dummy_inputs=UltravoxDummyInputsBuilder)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     hf_to_vllm_mapper = WeightsMapper(

@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
         device: torch.device,
     ) -> IntermediateTensors:
         return IntermediateTensors({
-            key: torch.zeros((batch_size, hidden_size),
-                             dtype=dtype,
-                             device=device)
+            key:
+            torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
             for key in keys
         })

@@ -166,7 +166,8 @@ class SamplingMetadata:
             pin_memory=pin_memory,
         )
         categorized_sample_indices = {
-            t: async_tensor_h2d(
+            t:
+            async_tensor_h2d(
                 seq_ids,
                 dtype=torch.int,
                 target_device=device,

@@ -198,8 +199,12 @@ def _prepare_seq_groups(
     device: str,
     generators: Optional[Dict[str, torch.Generator]] = None,
     cache: Optional[SamplingMetadataCache] = None,
-) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType,
-                                                         List[int]], int, ]:
+) -> Tuple[
+        List[SequenceGroupToSample],
+        List[int],
+        Dict[SamplingType, List[int]],
+        int,
+]:
     """Prepare sequence groups and indices for sampling.
 
     Args: