[Bugfix][CI] fix typos (#34934)
Signed-off-by: 1195343015 <1195343015@qq.com> Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -82,7 +82,7 @@ class CPUWNA16LinearKernel(MPLinearKernel):
|
||||
weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous()
|
||||
weight = pack_quantized_values_into_int32(weight, self.config.weight_type, 1)
|
||||
# make 16 output channel as a block and transpose to the make
|
||||
# the block contigous
|
||||
# the block contiguous
|
||||
weight = (
|
||||
weight.view(input_size, -1, 16 // pack_factor)
|
||||
.permute(1, 0, 2)
|
||||
|
||||
@@ -2540,7 +2540,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
|
||||
)
|
||||
# workspace
|
||||
# |------- N tokens --------|--------- N*dcp_size tokens ----------|
|
||||
# |<- use for loca_gather ->|<--------- use for allgather -------->|
|
||||
# |<- use for local_gather ->|<--------- use for allgather -------->|
|
||||
allgather_offset = workspace.shape[0] // (dcp_world_size + 1)
|
||||
assert allgather_offset * (dcp_world_size + 1) == workspace.shape[0]
|
||||
assert toks <= allgather_offset
|
||||
|
||||
@@ -394,5 +394,5 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
|
||||
|
||||
def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
|
||||
# No support for LoRA in flashinfer_cutlass_fused_moe.
|
||||
# See TODOs in flashinfer functions runMoe and runMoeMinLantency.
|
||||
# See TODOs in flashinfer functions runMoe and runMoeMinLatency.
|
||||
raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe")
|
||||
|
||||
@@ -409,7 +409,7 @@ def batched_fused_marlin_moe(
|
||||
Note that the moe_align_block_size function indicates,
|
||||
- What rows of the A matrix (hidden_states) to access during the
|
||||
matmul, via sorted_ids output.
|
||||
- What expert_id to use for each block matmul, via expert_ids ouptut.
|
||||
- What expert_id to use for each block matmul, via expert_ids output.
|
||||
|
||||
In the batched version, the tokens are already grouped/batched by experts
|
||||
they subscribe to. Due to this, we can represent the batched hidden_states
|
||||
|
||||
@@ -606,7 +606,7 @@ class FusedMoEExperts(ABC):
|
||||
"""
|
||||
Whether the kernel supports deployment in particular parallel config.
|
||||
|
||||
Can be overriden if a kernel does not support EP, SP or some other
|
||||
Can be overridden if a kernel does not support EP, SP or some other
|
||||
configuration.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@@ -620,7 +620,7 @@ class FusedMoEExperts(ABC):
|
||||
"""
|
||||
Whether the kernel supports a routing method (e.g. GroupedTopK).
|
||||
|
||||
Can be overriden by monolithic kernels that execute the router
|
||||
Can be overridden by monolithic kernels that execute the router
|
||||
in addition to the experts if certain routers are not supported.
|
||||
"""
|
||||
return True
|
||||
@@ -633,7 +633,7 @@ class FusedMoEExperts(ABC):
|
||||
"""
|
||||
Whether a kernel supports a particular dtype for router logits input.
|
||||
|
||||
Can be overriden by monolithic kernels that execute the router
|
||||
Can be overridden by monolithic kernels that execute the router
|
||||
in addition to the experts if certain dtypes are not supported.
|
||||
"""
|
||||
return True
|
||||
|
||||
@@ -1502,10 +1502,10 @@ class RowParallelLinear(LinearBase):
|
||||
if self.input_is_parallel:
|
||||
input_parallel = input_
|
||||
else:
|
||||
splitted_input = split_tensor_along_last_dim(
|
||||
split_input = split_tensor_along_last_dim(
|
||||
input_, num_partitions=self.tp_size
|
||||
)
|
||||
input_parallel = splitted_input[self.tp_rank].contiguous()
|
||||
input_parallel = split_input[self.tp_rank].contiguous()
|
||||
|
||||
# Matrix multiply.
|
||||
assert self.quant_method is not None
|
||||
|
||||
@@ -35,7 +35,7 @@ class MultiHeadLatentAttentionWrapper(PluggableLayer):
|
||||
"""Pluggable MLA layer which allows OOT backends to add
|
||||
custom implementations of the outer MLA layer (including rope & o_proj).
|
||||
Note that currently oot platforms can still use CustomOp.register_oot to
|
||||
replace MLA layer entirly, although we use PluggableLayer to register
|
||||
replace MLA layer entirely, although we use PluggableLayer to register
|
||||
this layer now.
|
||||
|
||||
This class takes positions and hidden_states as input.
|
||||
|
||||
@@ -191,7 +191,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
"""
|
||||
Helper function to update target_scheme_map
|
||||
since linear layers get fused into FusedMoE
|
||||
targetting 'Linear' needs to also match
|
||||
targeting 'Linear' needs to also match
|
||||
FusedMoE modules.
|
||||
"""
|
||||
if (
|
||||
|
||||
@@ -2445,7 +2445,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
||||
w2_scale=layer.w2_weight_scale, # group scale
|
||||
g1_alphas=layer.w13_weight_chan_scale,
|
||||
g2_alphas=layer.w2_weight_chan_scale,
|
||||
per_act_token_quant=True, # always use dynamc per-token
|
||||
per_act_token_quant=True, # always use dynamic per-token
|
||||
per_out_ch_quant=True, # always use per-channel
|
||||
)
|
||||
|
||||
|
||||
@@ -261,7 +261,7 @@ class CPUAWQLinearMethod(LinearMethodBase):
|
||||
|
||||
zeros = pack_cols(zeros, bits, group_num, output_size).contiguous()
|
||||
# make 16 output channel as a block and transpose to
|
||||
# the make the block contigous
|
||||
# the make the block contiguous
|
||||
weight = pack_cols(weight, bits, input_size, output_size)
|
||||
weight = (
|
||||
weight.view(input_size, -1, 16 // pack_factor)
|
||||
|
||||
@@ -199,7 +199,7 @@ class TorchAOConfig(QuantizationConfig):
|
||||
|
||||
@classmethod
|
||||
def from_config_dict_json(cls, config_dict_json: str) -> "TorchAOConfig":
|
||||
"""Iniitalize class from a config_dict json string, got from
|
||||
"""Initialize class from a config_dict json string, got from
|
||||
torchao_config_object = some AOBaseConfig object
|
||||
json.dumps(config_to_dict(torchao_config_object))
|
||||
"""
|
||||
|
||||
@@ -255,7 +255,7 @@ def _flashinfer_fp8_blockscale_gemm_impl(
|
||||
|
||||
This batch-size-dependent selection is essential for maintaining model accuracy.
|
||||
Benchmarks on GSM8K show a significant accuracy gap (88% vs 95%) for DeepSeek-V3.1
|
||||
when using FlashInfer's DeepGEMM on M>=32. The M < 32 strategy fixes the accurracy
|
||||
when using FlashInfer's DeepGEMM on M>=32. The M < 32 strategy fixes the accuracy
|
||||
drop.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -39,7 +39,7 @@ def query_machete_supported_group_sizes(act_type: torch.dtype) -> list[int]:
|
||||
|
||||
|
||||
def check_machete_supports_shape(
|
||||
in_features: int, out_featrues: int
|
||||
in_features: int, out_features: int
|
||||
) -> tuple[bool, str | None]:
|
||||
if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0:
|
||||
return (
|
||||
@@ -47,7 +47,7 @@ def check_machete_supports_shape(
|
||||
"Input features size must be divisible by "
|
||||
f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}",
|
||||
)
|
||||
if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0:
|
||||
if out_features % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0:
|
||||
return (
|
||||
False,
|
||||
"Output features size must be divisible by "
|
||||
|
||||
@@ -237,7 +237,7 @@ class ApplyRotaryEmb(CustomOp):
|
||||
Arguments of apply_rotary_emb() in vllm_flash_attn:
|
||||
x: [batch_size, seq_len, nheads, headdim]
|
||||
cos, sin: [seqlen_rotary, rotary_dim / 2]
|
||||
interleaved: defalut as False (Neox-style).
|
||||
interleaved: default as False (Neox-style).
|
||||
...
|
||||
"""
|
||||
interleaved = not self.is_neox_style
|
||||
@@ -259,7 +259,7 @@ class ApplyRotaryEmb(CustomOp):
|
||||
Arguments of apply_rotary() in flash_attn:
|
||||
x: [batch_size, seq_len, nheads, headdim]
|
||||
cos, sin: [seqlen_rotary, rotary_dim / 2]
|
||||
interleaved: defalut as False (Neox-style).
|
||||
interleaved: default as False (Neox-style).
|
||||
...
|
||||
"""
|
||||
interleaved = not self.is_neox_style
|
||||
|
||||
@@ -342,7 +342,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
|
||||
visual_token_mask = visual_token_mask.repeat(1, self.hidden_size).bool()
|
||||
text_token_mask = ~visual_token_mask
|
||||
final_experts_hidden_states = torch.zeros_like(hidden_states)
|
||||
final_shared_ouput = (
|
||||
final_shared_output = (
|
||||
torch.zeros_like(hidden_states) if self.has_shared_experts else None
|
||||
)
|
||||
|
||||
@@ -356,26 +356,26 @@ class Ernie4_5_VLMoeMoE(nn.Module):
|
||||
text_router_logits, _ = self.text_experts_gate(
|
||||
text_hidden_states.to(dtype=torch.float32)
|
||||
)
|
||||
text_shared_ouput, text_experts_output = self.text_experts(
|
||||
text_shared_output, text_experts_output = self.text_experts(
|
||||
hidden_states=text_hidden_states, router_logits=text_router_logits
|
||||
)
|
||||
final_experts_hidden_states[text_token_mask] = text_experts_output.flatten()
|
||||
if self.has_shared_experts:
|
||||
final_shared_ouput[text_token_mask] = text_shared_ouput.flatten()
|
||||
final_shared_output[text_token_mask] = text_shared_output.flatten()
|
||||
|
||||
vision_router_logits, _ = self.vision_experts_gate(
|
||||
vision_hidden_states.to(dtype=torch.float32)
|
||||
)
|
||||
vision_shared_ouput, vision_experts_output = self.vision_experts(
|
||||
vision_shared_output, vision_experts_output = self.vision_experts(
|
||||
hidden_states=vision_hidden_states, router_logits=vision_router_logits
|
||||
)
|
||||
final_experts_hidden_states[visual_token_mask] = (
|
||||
vision_experts_output.flatten()
|
||||
)
|
||||
if self.has_shared_experts:
|
||||
final_shared_ouput[visual_token_mask] = vision_shared_ouput.flatten()
|
||||
final_shared_output[visual_token_mask] = vision_shared_output.flatten()
|
||||
|
||||
final_hidden_states = (final_shared_ouput, final_experts_hidden_states)
|
||||
final_hidden_states = (final_shared_output, final_experts_hidden_states)
|
||||
else:
|
||||
# only text modal input
|
||||
text_router_logits, _ = self.text_experts_gate(
|
||||
|
||||
@@ -107,7 +107,7 @@ class Conv2dSubsampling(nn.Module):
|
||||
)
|
||||
|
||||
self.subsampling = 4
|
||||
left_context = right_context = 3 # both exclude currect frame
|
||||
left_context = right_context = 3 # both exclude current frame
|
||||
self.context = left_context + 1 + right_context # 7
|
||||
|
||||
def forward(
|
||||
|
||||
@@ -115,7 +115,7 @@ class EncoderLayerSANM(nn.Module):
|
||||
hidden_states: torch.Tensor,
|
||||
mask: torch.Tensor | None = None,
|
||||
cache=None,
|
||||
mask_shfit_chunk=None,
|
||||
mask_shift_chunk=None,
|
||||
mask_att_chunk_encoder=None,
|
||||
):
|
||||
residual = hidden_states
|
||||
@@ -125,14 +125,14 @@ class EncoderLayerSANM(nn.Module):
|
||||
hidden_states = residual + self.self_attn(
|
||||
hidden_states,
|
||||
mask,
|
||||
mask_shfit_chunk=mask_shfit_chunk,
|
||||
mask_shift_chunk=mask_shift_chunk,
|
||||
mask_att_chunk_encoder=mask_att_chunk_encoder,
|
||||
)
|
||||
else:
|
||||
hidden_states = self.self_attn(
|
||||
hidden_states,
|
||||
mask,
|
||||
mask_shfit_chunk=mask_shfit_chunk,
|
||||
mask_shift_chunk=mask_shift_chunk,
|
||||
mask_att_chunk_encoder=mask_att_chunk_encoder,
|
||||
)
|
||||
|
||||
@@ -140,7 +140,7 @@ class EncoderLayerSANM(nn.Module):
|
||||
hidden_states = self.norm2(hidden_states)
|
||||
hidden_states = residual + self.feed_forward(hidden_states)
|
||||
|
||||
return hidden_states, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
|
||||
return hidden_states, mask, cache, mask_shift_chunk, mask_att_chunk_encoder
|
||||
|
||||
|
||||
class MultiHeadedAttentionSANM(nn.Module):
|
||||
@@ -183,13 +183,13 @@ class MultiHeadedAttentionSANM(nn.Module):
|
||||
self,
|
||||
inputs: torch.Tensor,
|
||||
mask: torch.Tensor,
|
||||
mask_shfit_chunk: torch.Tensor = None,
|
||||
mask_shift_chunk: torch.Tensor = None,
|
||||
):
|
||||
b, t, d = inputs.size()
|
||||
if mask is not None:
|
||||
mask = torch.reshape(mask, (b, -1, 1))
|
||||
if mask_shfit_chunk is not None:
|
||||
mask = mask * mask_shfit_chunk
|
||||
if mask_shift_chunk is not None:
|
||||
mask = mask * mask_shift_chunk
|
||||
inputs = inputs * mask
|
||||
|
||||
x = inputs.transpose(1, 2)
|
||||
@@ -243,11 +243,11 @@ class MultiHeadedAttentionSANM(nn.Module):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
mask: torch.Tensor,
|
||||
mask_shfit_chunk: torch.Tensor = None,
|
||||
mask_shift_chunk: torch.Tensor = None,
|
||||
mask_att_chunk_encoder: torch.Tensor = None,
|
||||
):
|
||||
q_h, k_h, v_h, v = self.forward_qkv(hidden_states)
|
||||
fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
|
||||
fsmn_memory = self.forward_fsmn(v, mask, mask_shift_chunk)
|
||||
q_h = q_h * self.d_k ** (-0.5)
|
||||
scores = torch.matmul(q_h, k_h.transpose(-2, -1))
|
||||
att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
|
||||
|
||||
@@ -646,7 +646,7 @@ class IsaacImageProcessor:
|
||||
return_tensors: str | TensorType | None,
|
||||
**kwargs: Unpack[IsaacImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""Preprocess images into format compatibile with vLLM input processing."""
|
||||
"""Preprocess images into format compatible with vLLM input processing."""
|
||||
|
||||
all_pixel_values: list[torch.Tensor] = []
|
||||
all_image_grids: list[torch.Tensor] = []
|
||||
|
||||
@@ -299,7 +299,7 @@ class KeyeVisionEmbeddings(nn.Module):
|
||||
)
|
||||
(
|
||||
batch_size,
|
||||
squence_len,
|
||||
sequence_len,
|
||||
channel,
|
||||
height,
|
||||
width,
|
||||
|
||||
@@ -238,7 +238,7 @@ class LongcatRouter(nn.Module):
|
||||
self,
|
||||
config: FlashConfig,
|
||||
zero_expert_num: int,
|
||||
rounter_params_dtype: torch.dtype,
|
||||
router_params_dtype: torch.dtype,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
@@ -252,12 +252,12 @@ class LongcatRouter(nn.Module):
|
||||
config.hidden_size,
|
||||
self.n_routed_experts,
|
||||
bias=config.router_bias,
|
||||
params_dtype=rounter_params_dtype,
|
||||
params_dtype=router_params_dtype,
|
||||
quant_config=None,
|
||||
prefix=f"{prefix}.classifier",
|
||||
)
|
||||
self.e_score_correction_bias = nn.Parameter(
|
||||
torch.zeros((self.n_routed_experts), dtype=rounter_params_dtype)
|
||||
torch.zeros((self.n_routed_experts), dtype=router_params_dtype)
|
||||
)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
@@ -281,14 +281,14 @@ class LongcatMoe(nn.Module):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
# Gate always runs at half / full precision for now.
|
||||
self.rounter_params_dtype = params_dtype
|
||||
self.router_params_dtype = params_dtype
|
||||
if config.router_dtype == "float32":
|
||||
self.rounter_params_dtype = torch.float32
|
||||
self.router_params_dtype = torch.float32
|
||||
|
||||
self.router = LongcatRouter(
|
||||
config=config,
|
||||
zero_expert_num=config.zero_expert_num,
|
||||
rounter_params_dtype=self.rounter_params_dtype,
|
||||
router_params_dtype=self.router_params_dtype,
|
||||
prefix=f"{prefix}.gate",
|
||||
)
|
||||
|
||||
@@ -309,7 +309,7 @@ class LongcatMoe(nn.Module):
|
||||
prefix=f"{prefix}.experts",
|
||||
enable_eplb=enable_eplb,
|
||||
routed_scaling_factor=config.routed_scaling_factor,
|
||||
router_logits_dtype=self.rounter_params_dtype,
|
||||
router_logits_dtype=self.router_params_dtype,
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
@@ -329,7 +329,7 @@ class LongcatMoe(nn.Module):
|
||||
hidden_states_padded = hidden_states
|
||||
|
||||
router_logits_full = self.router(
|
||||
hidden_states_padded.to(self.rounter_params_dtype)
|
||||
hidden_states_padded.to(self.router_params_dtype)
|
||||
)
|
||||
|
||||
# ZeroExpertFusedMoE handles routing memoization and zero expert computation
|
||||
|
||||
@@ -1321,14 +1321,14 @@ def get_image_size(image: ImageInput) -> ImageSize:
|
||||
raise ValueError(f"Unknown image type: {type(image)}")
|
||||
|
||||
|
||||
def exif_tranpose(
|
||||
def exif_transpose(
|
||||
images: ImageInput | None,
|
||||
) -> ImageInput | None:
|
||||
if images is None:
|
||||
return None
|
||||
if images is not None and isinstance(images, (list, tuple)):
|
||||
images = [
|
||||
exif_tranpose(img) if isinstance(img, Image) else img for img in images
|
||||
exif_transpose(img) if isinstance(img, Image) else img for img in images
|
||||
]
|
||||
elif images is not None and isinstance(images, Image):
|
||||
images = ImageOps.exif_transpose(images)
|
||||
@@ -1667,7 +1667,7 @@ class Molmo2ProcessorWrapper:
|
||||
**kwargs: object,
|
||||
) -> BatchFeature:
|
||||
inputs = [text]
|
||||
images = exif_tranpose(images)
|
||||
images = exif_transpose(images)
|
||||
if getattr(self.processor, "image_processor", None) is not None:
|
||||
inputs.append(images)
|
||||
if getattr(self.processor, "video_processor", None) is not None:
|
||||
@@ -2352,7 +2352,7 @@ class Molmo2MultiModalProcessor(BaseMultiModalProcessor[Molmo2ProcessingInfo]):
|
||||
def get_image_replacement_molmo2(item_idx: int) -> list[int]:
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image = images.get(item_idx)
|
||||
image = exif_tranpose(image)
|
||||
image = exif_transpose(image)
|
||||
|
||||
resize_nrows, resize_cols = processor.get_base_grid_size(is_video=False)
|
||||
if use_single_crop_col_tokens is not None:
|
||||
|
||||
@@ -349,7 +349,7 @@ class NemotronHMoEDecoderLayer(nn.Module):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
|
||||
# Get per-layer config for heterogeneous models if exsist
|
||||
# Get per-layer config for heterogeneous models if exists
|
||||
get_layer_config = getattr(config, "get_nemotron_h_config_for_layer", None)
|
||||
layer_config = get_layer_config(layer_idx) if get_layer_config else config
|
||||
|
||||
@@ -517,7 +517,7 @@ class NemotronHAttentionDecoderLayer(nn.Module):
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
# Get per-layer config for heterogeneous models if exsist
|
||||
# Get per-layer config for heterogeneous models if exists
|
||||
get_layer_config = getattr(config, "get_nemotron_h_config_for_layer", None)
|
||||
layer_config = get_layer_config(layer_idx) if get_layer_config else config
|
||||
|
||||
|
||||
@@ -486,7 +486,7 @@ class SiglipVisionEmbeddings(nn.Module):
|
||||
)
|
||||
(
|
||||
batch_size,
|
||||
squence_len,
|
||||
sequence_len,
|
||||
channel,
|
||||
height,
|
||||
width,
|
||||
|
||||
@@ -689,19 +689,19 @@ class ConformerEncoder(TransformerEncoderBase):
|
||||
default False.
|
||||
ext_pw_out_channel: int, optional
|
||||
the number of channel for CNN
|
||||
before depthwise_seperable_CNN.
|
||||
before depthwise_separable_CNN.
|
||||
If 0 then use linear. default 0.
|
||||
ext_pw_kernel_size: int, optional
|
||||
kernel size of N before depthwise_seperable_CNN.
|
||||
kernel size of N before depthwise_separable_CNN.
|
||||
only work for ext_pw_out_channel > 0.
|
||||
default 1
|
||||
depthwise_seperable_out_channel: int, optional
|
||||
the number of channel for
|
||||
depthwise_seperable_CNN.
|
||||
depthwise_separable_CNN.
|
||||
default 256.
|
||||
depthwise_multiplier: int, optional
|
||||
the number of multiplier for
|
||||
depthwise_seperable_CNN.
|
||||
depthwise_separable_CNN.
|
||||
default 1.
|
||||
chunk_se: int, optional
|
||||
0 for offline SE.
|
||||
@@ -711,7 +711,7 @@ class ConformerEncoder(TransformerEncoderBase):
|
||||
by only the current chunk.
|
||||
default 0.
|
||||
kernel_size: int, optional
|
||||
the number of kernels for depthwise_seperable_CNN.
|
||||
the number of kernels for depthwise_separable_CNN.
|
||||
default 3.
|
||||
activation: str, optional
|
||||
FeedForward block activation.
|
||||
@@ -721,7 +721,7 @@ class ConformerEncoder(TransformerEncoderBase):
|
||||
activation function used in ConvModule part
|
||||
of the conformer, default "relu".
|
||||
conv_glu_type: str, optional
|
||||
activation used use glu in depthwise_seperable_CNN,
|
||||
activation used use glu in depthwise_separable_CNN,
|
||||
default "sigmoid"
|
||||
bias_in_glu: bool, optional
|
||||
if set to True, use additive bias in the weight module
|
||||
|
||||
@@ -217,8 +217,8 @@ class GLUPointWiseConv(nn.Module):
|
||||
return x
|
||||
|
||||
|
||||
class DepthWiseSeperableConv1d(nn.Module):
|
||||
"""DepthWiseSeperableConv1d module used in Convnet module
|
||||
class DepthWiseSeparableConv1d(nn.Module):
|
||||
"""DepthWiseSeparableConv1d module used in ConvNet module
|
||||
for the conformer, for more details see:
|
||||
https://arxiv.org/pdf/2005.08100v1.pdf
|
||||
|
||||
@@ -390,7 +390,7 @@ class ConvModule(nn.Module):
|
||||
else:
|
||||
padding = (kernel_size - 1) // 2
|
||||
|
||||
self.dw_sep_conv_1d = DepthWiseSeperableConv1d(
|
||||
self.dw_sep_conv_1d = DepthWiseSeparableConv1d(
|
||||
input_dim,
|
||||
depthwise_seperable_out_channel,
|
||||
kernel_size,
|
||||
|
||||
@@ -916,7 +916,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
||||
self, max_pixels: int | None = None
|
||||
) -> ImageSize:
|
||||
# NOTE: Simply processing a huge size with _get_vision_info might not give a
|
||||
# size that maximizes the number of featrues, i.e., the number of (merged)
|
||||
# size that maximizes the number of features, i.e., the number of (merged)
|
||||
# patches. This is because the number of patches limits the allowed aspect
|
||||
# ratios. For example, suppose the maximum number of patches is 1280. A square
|
||||
# image cannot be broken down into 1280 patches, so feeding a giant square image
|
||||
|
||||
@@ -459,14 +459,14 @@ class Step3VLProcessor:
|
||||
image_inputs = {}
|
||||
text_inputs = self.tokenizer(text)
|
||||
else:
|
||||
splitted_images_data = self._split_images(images)
|
||||
split_images_data = self._split_images(images)
|
||||
pixel_values_lst = []
|
||||
patch_pixel_values_lst = []
|
||||
patch_newline_mask_lst = []
|
||||
image_repl_str_lst = []
|
||||
image_repl_ids_lst = []
|
||||
num_patches = []
|
||||
for raw_img, img_patches, patch_newline_mask in splitted_images_data:
|
||||
for raw_img, img_patches, patch_newline_mask in split_images_data:
|
||||
pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
|
||||
|
||||
if len(img_patches) > 0:
|
||||
|
||||
@@ -353,7 +353,7 @@ class FusedMoEBlock(nn.Module):
|
||||
if swiglu_limit not in (None, 0):
|
||||
swiglu_limit = float(swiglu_limit)
|
||||
assert swiglu_limit == 7.0, (
|
||||
"Swiglu limit in fused moe block only suport 7.0 now."
|
||||
"Swiglu limit in fused moe block only support 7.0 now."
|
||||
)
|
||||
activation = "swiglustep"
|
||||
logger.debug(
|
||||
|
||||
Reference in New Issue
Block a user