Update pre-commit hooks (#12475)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
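
The bumped hooks reformat code across the repository. Judging from the hunks below, the updated formatter now breaks long conditions before comparison operators (the placement PEP 8 recommends) and re-indents multi-line signatures, comprehensions, and decorator calls, while accompanying lint fixes modernize two idioms: functools.cache replaces lru_cache(maxsize=None), and f-strings replace %-interpolation. The changes appear to be formatting-only, with no intended behavior change.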
@@ -73,12 +73,12 @@ class MPLinearKernel(ABC):
             torch.nn.Parameter(new_param.data, requires_grad=False))

     def _get_weight_params(
-            self, layer: torch.nn.Module
-    ) -> Tuple[torch.Tensor,  # w_q
-               torch.Tensor,  # w_s
-               Optional[torch.Tensor],  # w_zp,
-               Optional[torch.Tensor]  # w_gidx
-               ]:
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # w_q
+                torch.Tensor,  # w_s
+                Optional[torch.Tensor],  # w_zp,
+                Optional[torch.Tensor]  # w_gidx
+            ]:
         return (
             getattr(layer, self.w_q_name),
             getattr(layer, self.w_s_name),

@@ -48,13 +48,13 @@ class ScaledMMLinearKernel(ABC):
         raise NotImplementedError

     def _get_weight_params(
-            self, layer: torch.nn.Module
-    ) -> Tuple[torch.Tensor,  # weight
-               torch.Tensor,  # weight_scale
-               Optional[torch.Tensor],  # input_scale,
-               Optional[torch.Tensor],  # input_zp
-               Optional[torch.Tensor],  # azp_adj
-               ]:
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # weight
+                torch.Tensor,  # weight_scale
+                Optional[torch.Tensor],  # input_scale,
+                Optional[torch.Tensor],  # input_zp
+                Optional[torch.Tensor],  # azp_adj
+            ]:
         return (
             getattr(layer, self.w_q_name),
             getattr(layer, self.w_s_name),

@@ -72,9 +72,10 @@ def block_quant_to_tensor_quant(
     x_dq_block = x_q_block.to(torch.float32)

     x_dq_block_tiles = [[
-        x_dq_block[j * block_n:min((j + 1) * block_n, n),
-                   i * block_k:min((i + 1) * block_k, k), ]
-        for i in range(k_tiles)
+        x_dq_block[
+            j * block_n:min((j + 1) * block_n, n),
+            i * block_k:min((i + 1) * block_k, k),
+        ] for i in range(k_tiles)
     ] for j in range(n_tiles)]

     for i in range(k_tiles):

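As a side note on what this comprehension computes, here is a minimal self-contained sketch (with made-up sizes) of the same tiling logic: it carves an (n, k) tensor into a grid of block_n x block_k tiles, with min() clipping the ragged edge tiles.

    import torch

    # Hypothetical sizes; the real n, k, block_n, block_k come from the caller.
    n, k, block_n, block_k = 6, 10, 4, 4
    n_tiles = (n + block_n - 1) // block_n  # ceil-divide: 2
    k_tiles = (k + block_k - 1) // block_k  # ceil-divide: 3
    x_dq_block = torch.arange(n * k, dtype=torch.float32).reshape(n, k)
    x_dq_block_tiles = [[
        x_dq_block[
            j * block_n:min((j + 1) * block_n, n),
            i * block_k:min((i + 1) * block_k, k),
        ] for i in range(k_tiles)
    ] for j in range(n_tiles)]
    assert x_dq_block_tiles[1][2].shape == (2, 2)  # ragged bottom-right tile
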
@@ -73,8 +73,8 @@ def requantize_with_max_scale(
     # from disk in this case. Skip requantization in this case (since)
     # we already are quantized with the single scale.
     # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
-    unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo(
-        torch.float8_e4m3fn).min)
+    unfused_module_in_checkpoint = (weight_scale[-1]
+                                    > torch.finfo(torch.float8_e4m3fn).min)

     # If unfused checkpoint, need requanize with the single scale.
     if unfused_module_in_checkpoint:

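For context on the comparison being reflowed: torch.finfo gives the numeric limits of the FP8 dtype, and a scale above that minimum signals a value actually read from the checkpoint, presumably because unloaded scales are initialized to the minimum as a sentinel. A quick illustration:

    import torch

    # float8_e4m3fn limits; min serves as the "not loaded" sentinel here.
    info = torch.finfo(torch.float8_e4m3fn)
    print(info.min, info.max)  # -448.0 448.0
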
@@ -716,9 +716,10 @@ def _sample_with_torch(
         tensors required for Pythonization
     '''

-    categorized_seq_group_ids: Dict[SamplingType,
-                                    List[int]] = {t: []
-                                                  for t in SamplingType}
+    categorized_seq_group_ids: Dict[SamplingType, List[int]] = {
+        t: []
+        for t in SamplingType
+    }
     categorized_sample_indices = sampling_metadata.categorized_sample_indices
     for i, seq_group in enumerate(sampling_metadata.seq_groups):
         sampling_params = seq_group.sampling_params

@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:

     def __post_init__(self):
         # sanity checks
-        assert (self.padded_org_vocab_start_index <=
-                self.padded_org_vocab_end_index)
-        assert (self.padded_added_vocab_start_index <=
-                self.padded_added_vocab_end_index)
+        assert (self.padded_org_vocab_start_index
+                <= self.padded_org_vocab_end_index)
+        assert (self.padded_added_vocab_start_index
+                <= self.padded_added_vocab_end_index)

         assert self.org_vocab_start_index <= self.org_vocab_end_index
         assert self.added_vocab_start_index <= self.added_vocab_end_index

         assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
-        assert (self.added_vocab_start_index <=
-                self.padded_added_vocab_start_index)
+        assert (self.added_vocab_start_index
+                <= self.padded_added_vocab_start_index)
         assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
         assert self.added_vocab_end_index <= self.padded_added_vocab_end_index

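This assert reflow shows the main stylistic change of the commit: long conditions now break before a comparison operator rather than after it, the placement PEP 8 recommends for readability. A schematic one-liner with made-up names:

    a, b = 1, 2
    old_style = (a <=
                 b)   # old: the operator ends the first line
    new_style = (a
                 <= b)  # new: the operator starts the continuation line
    assert old_style is new_style is True
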
@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
         added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
     # torch.compile will fuse all of the pointwise ops below
     # into a single kernel, making it very fast
-    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
-                                                          org_vocab_end_index)
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (
+        input_ < org_vocab_end_index)
     added_vocab_mask = (input_ >= added_vocab_start_index) & (
         input_ < added_vocab_end_index)
     added_offset = added_vocab_start_index - (

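The expression being rewrapped is a plain range check; a small sketch with made-up vocab bounds shows what it produces and why the comment mentions fusion (every operation is pointwise, so torch.compile can emit a single kernel):

    import torch

    input_ = torch.tensor([5, 100, 2000])
    org_vocab_start_index, org_vocab_end_index = 0, 1000
    org_vocab_mask = (input_ >= org_vocab_start_index) & (
        input_ < org_vocab_end_index)
    print(org_vocab_mask)  # tensor([ True,  True, False])
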
@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 # from being incorrectly identified as being present in
                 # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
                 shard_pos = quant_param_name.find(shard_name)
-                can_correct_rename = (shard_pos > 0) and (
-                    quant_param_name[shard_pos - 1] == ".")
+                can_correct_rename = (shard_pos
+                                      > 0) and (quant_param_name[shard_pos - 1]
+                                                == ".")
                 # If the quant_param_name is packed, it won't occur in the
                 # param_dict before renaming.
                 new_quant_param_name = quant_param_name.replace(

@@ -298,8 +298,8 @@ class TensorizerAgent:
         to allow for adapter added tokens."""
         for child in self.model.modules():
             if (isinstance(child, VocabParallelEmbedding)
-                    and child.weight.shape[0] <
-                    child.num_embeddings_per_partition):
+                    and child.weight.shape[0]
+                    < child.num_embeddings_per_partition):
                 new_weight = torch.empty(child.num_embeddings_per_partition,
                                          child.embedding_dim,
                                          dtype=child.weight.dtype,

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
-from functools import lru_cache
+from functools import cache
 from typing import Iterable, List, Optional, Set, Tuple, Union

 import torch

@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
 logger = init_logger(__name__)


-@lru_cache(maxsize=None)
+@cache
 def _get_gemma_act_fn(
     hidden_act: Optional[str],
     hidden_activation: Optional[str],

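The two Gemma hunks are a drop-in modernization: functools.cache (Python 3.9+) is defined as lru_cache(maxsize=None), i.e. an unbounded cache with no LRU eviction bookkeeping. For example:

    from functools import cache

    @cache  # identical to @lru_cache(maxsize=None)
    def fib(n: int) -> int:
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    assert fib(64) == 10610209857723
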
@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 for e in range(p.size(0)):
                     w1_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w1.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w1.weight")
                     w3_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w3.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w3.weight")
                     w1_param, w3_param = p[e].chunk(2, dim=0)
                     assert w1_name not in new_weights
                     assert w3_name not in new_weights

@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 for e in range(p.size(0)):
                     w2_name = n.replace(
                         '.block_sparse_moe.output_linear.weight',
-                        ".block_sparse_moe.experts.%d.w2.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w2.weight")
                     w2_param = p[e]
                     assert w2_name not in new_weights
                     new_weights[w2_name] = w2_param

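The GraniteMoe hunks swap printf-style interpolation for f-strings; the rendered expert names are identical:

    e = 3
    old = ".block_sparse_moe.experts.%d.w1.weight" % e
    new = f".block_sparse_moe.experts.{e}.w1.weight"
    assert old == new == ".block_sparse_moe.experts.3.w1.weight"
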
@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
         # For 1) text-only prefill and decode, 2) image-present decode.
         if image_inputs is None:
             full_text_row_masked_out_mask = (
-                attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to(
-                    input_ids.device)
+                attn_metadata.encoder_seq_lens_tensor
+                != 0).reshape(-1, 1).to(input_ids.device)
             skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0

         # For image-present prefill.

@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):

         if self.tie_weights:
             assert (
-                self.n_predict >
-                1), "You cannot tie weights between stages when only 1 exists"
+                self.n_predict > 1
+            ), "You cannot tie weights between stages when only 1 exists"
             embedding = VocabParallelEmbedding(
                 config.vocab_size,
                 self.inner_dim,

@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
     # compute mask for sparsity
     mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
     factor = scores.abs().clamp(min=mask_logits_threshold)
-    mask_logits_threshold = (
-        (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+    mask_logits_threshold = ((mask_logits_threshold - scores) /
+                             factor) > (2 * jitter_eps)

     # apply mask
     masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))

@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
     mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
                                                        keepdim=True)
     factor = scores.abs().clamp(min=mask_logits_threshold)
-    mask_logits_threshold = (
-        (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+    mask_logits_threshold = ((mask_logits_threshold - scores) /
+                             factor) > (2 * jitter_eps)

     # apply mask
     masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,

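Behavior is unchanged by the reflow; the expression computes a jitter mask that drops entries whose relative gap to the row maximum exceeds 2 * jitter_eps. A small worked example with made-up scores, assuming the same semantics:

    import torch

    scores = torch.tensor([[2.0, 1.99, 0.5]])
    jitter_eps = 0.01
    mask_logits_threshold, _ = scores.max(dim=-1, keepdim=True)  # 2.0
    factor = scores.abs().clamp(min=mask_logits_threshold)       # all 2.0
    mask = ((mask_logits_threshold - scores) /
            factor) > (2 * jitter_eps)
    print(mask)  # tensor([[False, False,  True]]) -- only 0.5 is masked out
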
@@ -462,7 +462,8 @@ class _ModelRegistry:


 ModelRegistry = _ModelRegistry({
-    model_arch: _LazyRegisteredModel(
+    model_arch:
+    _LazyRegisteredModel(
         module_name=f"vllm.model_executor.models.{mod_relname}",
         class_name=cls_name,
     )

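For readers unfamiliar with the registry being reformatted here: _LazyRegisteredModel records a module path and class name so the model class is imported only when first requested. A rough sketch of that idea (field names taken from the hunk, everything else assumed):

    import importlib
    from dataclasses import dataclass

    @dataclass(frozen=True)
    class LazyRegisteredModel:  # simplified stand-in for vLLM's internal class
        module_name: str
        class_name: str

        def load_model_cls(self) -> type:
            # Defer the (potentially heavy) model import until requested.
            module = importlib.import_module(self.module_name)
            return getattr(module, self.class_name)
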
@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
         return hidden_states


-@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
-                                        info=UltravoxProcessingInfo,
-                                        dummy_inputs=UltravoxDummyInputsBuilder
-                                        )
+@MULTIMODAL_REGISTRY.register_processor(
+    UltravoxMultiModalProcessor,
+    info=UltravoxProcessingInfo,
+    dummy_inputs=UltravoxDummyInputsBuilder)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):

     hf_to_vllm_mapper = WeightsMapper(

@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
         device: torch.device,
     ) -> IntermediateTensors:
         return IntermediateTensors({
-            key: torch.zeros((batch_size, hidden_size),
-                             dtype=dtype,
-                             device=device)
+            key:
+            torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
             for key in keys
         })

@@ -166,7 +166,8 @@ class SamplingMetadata:
             pin_memory=pin_memory,
         )
         categorized_sample_indices = {
-            t: async_tensor_h2d(
+            t:
+            async_tensor_h2d(
                 seq_ids,
                 dtype=torch.int,
                 target_device=device,

@@ -198,8 +199,12 @@ def _prepare_seq_groups(
     device: str,
     generators: Optional[Dict[str, torch.Generator]] = None,
     cache: Optional[SamplingMetadataCache] = None,
-) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType,
-                                                        List[int]], int, ]:
+) -> Tuple[
+        List[SequenceGroupToSample],
+        List[int],
+        Dict[SamplingType, List[int]],
+        int,
+]:
     """Prepare sequence groups and indices for sampling.

     Args: