Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
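This change mechanically reflows the file from yapf's paren-aligned continuation style (with isort-managed imports) to ruff's Black-compatible formatter and import sorter: 4-space hanging indents, a magic trailing comma that forces one argument per line, double quotes, and no blank line directly after a class statement, as the hunks below show. A minimal sketch of the style difference, using a hypothetical helper rather than code from this diff:

# Hypothetical helper, defined only so this illustration is self-contained.
def some_function(first_argument, second_argument, third_argument):
    return first_argument + second_argument + third_argument

# Before (yapf): continuation lines aligned to the opening parenthesis.
result = some_function(1,
                       2,
                       3)

# After (ruff format, Black-compatible): 4-space hanging indent; the trailing
# comma keeps one argument per line.
result = some_function(
    1,
    2,
    3,
)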
@@ -17,28 +17,32 @@ import torch.nn.functional as F
 from transformers import PretrainedConfig
 
 from vllm.attention.layer import MultiHeadAttention
-from vllm.distributed import (divide, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              split_tensor_along_last_dim,
-                              tensor_model_parallel_all_gather)
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from .vision import run_dp_sharded_vision_model
 
 NORM2FN = {
-    'rms_norm': RMSNorm,
-    'layer_norm': nn.LayerNorm,
+    "rms_norm": RMSNorm,
+    "layer_norm": nn.LayerNorm,
 }
 
 
 class InternVisionEmbeddings(nn.Module):
-
     def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.config = config
@@ -48,28 +52,36 @@ class InternVisionEmbeddings(nn.Module):
 
         self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
 
-        self.patch_embedding = nn.Conv2d(in_channels=3,
-                                         out_channels=self.embed_dim,
-                                         kernel_size=self.patch_size,
-                                         stride=self.patch_size)
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
 
-        self.num_patches = (self.image_size // self.patch_size)**2
+        self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
 
         self.position_embedding = nn.Parameter(
-            torch.randn(1, self.num_positions, self.embed_dim))
+            torch.randn(1, self.num_positions, self.embed_dim)
+        )
 
     def _get_pos_embed(self, pos_embed: torch.Tensor, H: int, W: int):
         target_dtype = pos_embed.dtype
-        pos_embed = pos_embed.float().reshape(
-            1, self.image_size // self.patch_size,
-            self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
-        pos_embed = F.interpolate(pos_embed,
-                                  size=(H, W),
-                                  mode='bicubic',
-                                  align_corners=False)
-        return pos_embed.reshape(1, -1, H * W).permute(0, 2,
-                                                       1).to(target_dtype)
+        pos_embed = (
+            pos_embed.float()
+            .reshape(
+                1,
+                self.image_size // self.patch_size,
+                self.image_size // self.patch_size,
+                -1,
+            )
+            .permute(0, 3, 1, 2)
+        )
+        pos_embed = F.interpolate(
+            pos_embed, size=(H, W), mode="bicubic", align_corners=False
+        )
+        return pos_embed.reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
 
     def _get_position_embedding(self, H: int, W: int) -> torch.Tensor:
         position_embedding = self.position_embedding
@@ -86,12 +98,12 @@ class InternVisionEmbeddings(nn.Module):
 
     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values.to(
-            target_dtype))  # shape = [*, channel, width, height]
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(target_dtype)
+        )  # shape = [*, channel, width, height]
         batch_size, _, height, width = patch_embeds.shape
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
-        class_embeds = self.class_embedding.expand(batch_size, 1,
-                                                   -1).to(target_dtype)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
         embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
         position_embedding = self._get_position_embedding(height, width)
         embeddings = embeddings + position_embedding.to(target_dtype)
@@ -99,7 +111,6 @@ class InternVisionEmbeddings(nn.Module):
 
 
 class InternVisionPatchModel(nn.Module):
-
     def __init__(self, config: PretrainedConfig):
         super().__init__()
         self.config = config
@@ -114,8 +125,7 @@ class InternVisionPatchModel(nn.Module):
         pixel_embeds: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
         if pixel_values is None and pixel_embeds is None:
-            raise ValueError(
-                'You have to specify pixel_values or pixel_embeds')
+            raise ValueError("You have to specify pixel_values or pixel_embeds")
 
         if pixel_embeds is not None:
             hidden_states = pixel_embeds
@@ -123,8 +133,7 @@ class InternVisionPatchModel(nn.Module):
             if pixel_values.ndim == 4:
                 hidden_states = self.embeddings(pixel_values)
             else:
-                raise ValueError(
-                    f'wrong pixel_values size: {pixel_values.shape}')
+                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")
 
         return hidden_states
 
@@ -149,19 +158,21 @@ class InternParallelAttention(nn.Module):
         self.head_dim = self.embed_dim // self.num_heads
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
-                f'embed_dim must be divisible by num_heads '
-                f'(got `embed_dim`: {self.embed_dim} and `num_heads`:'
-                f' {self.num_heads}).')
+                f"embed_dim must be divisible by num_heads "
+                f"(got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
 
-        self.tp_size = (1 if use_data_parallel else
-                        get_tensor_model_parallel_world_size())
-        self.tp_rank = (0 if use_data_parallel else
-                        get_tensor_model_parallel_rank())
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = 0 if use_data_parallel else get_tensor_model_parallel_rank()
 
         # Additional dummy heads are used to enable TP for common GPU counts.
         self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim
-        self.num_heads_per_partition = divide(num_dummy_heads + self.num_heads,
-                                              self.tp_size)
+        self.num_heads_per_partition = divide(
+            num_dummy_heads + self.num_heads, self.tp_size
+        )
 
         self.scale = self.head_dim**-0.5
         self.qkv = QKVParallelLinear(
@@ -177,12 +188,16 @@ class InternParallelAttention(nn.Module):
         self.qk_normalization = config.qk_normalization
 
         if self.qk_normalization:
-            self.q_norm = RMSNorm(self.dummy_dim,
-                                  eps=config.layer_norm_eps,
-                                  var_hidden_size=self.embed_dim)
-            self.k_norm = RMSNorm(self.dummy_dim,
-                                  eps=config.layer_norm_eps,
-                                  var_hidden_size=self.embed_dim)
+            self.q_norm = RMSNorm(
+                self.dummy_dim,
+                eps=config.layer_norm_eps,
+                var_hidden_size=self.embed_dim,
+            )
+            self.k_norm = RMSNorm(
+                self.dummy_dim,
+                eps=config.layer_norm_eps,
+                var_hidden_size=self.embed_dim,
+            )
 
         self.proj = RowParallelLinear(
             self.dummy_dim,
@@ -192,8 +207,9 @@ class InternParallelAttention(nn.Module):
             disable_tp=use_data_parallel,
         )
 
-        self.attn = MultiHeadAttention(self.num_heads_per_partition,
-                                       self.head_dim, self.scale)
+        self.attn = MultiHeadAttention(
+            self.num_heads_per_partition, self.head_dim, self.scale
+        )
 
     def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
         if self.tp_size > 1:
@@ -202,8 +218,7 @@ class InternParallelAttention(nn.Module):
         q = self.q_norm(q)
         k = self.k_norm(k)
         if self.tp_size > 1:
-            splitter = partial(split_tensor_along_last_dim,
-                               num_partitions=self.tp_size)
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
             q = splitter(q)[self.tp_rank]
             k = splitter(k)[self.tp_rank]
         return q, k
@@ -222,7 +237,6 @@ class InternParallelAttention(nn.Module):
 
 
 class InternMLP(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -234,18 +248,22 @@ class InternMLP(nn.Module):
 
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        self.fc1 = ColumnParallelLinear(config.hidden_size,
-                                        config.intermediate_size,
-                                        bias=True,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1",
-                                        disable_tp=use_data_parallel)
-        self.fc2 = RowParallelLinear(config.intermediate_size,
-                                     config.hidden_size,
-                                     bias=True,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2",
-                                     disable_tp=use_data_parallel)
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
@@ -256,7 +274,6 @@ class InternMLP(nn.Module):
 
 
 class InternVisionEncoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -272,25 +289,25 @@ class InternVisionEncoderLayer(nn.Module):
         self.intermediate_size = config.intermediate_size
         self.norm_type = config.norm_type
 
-        self.attn = self._init_attn(config,
-                                    quant_config,
-                                    num_dummy_heads=num_dummy_heads,
-                                    prefix=f"{prefix}.attn",
-                                    use_data_parallel=use_data_parallel)
+        self.attn = self._init_attn(
+            config,
+            quant_config,
+            num_dummy_heads=num_dummy_heads,
+            prefix=f"{prefix}.attn",
+            use_data_parallel=use_data_parallel,
+        )
 
-        self.mlp = InternMLP(config,
-                             quant_config=quant_config,
-                             prefix=f"{prefix}.mlp",
-                             use_data_parallel=use_data_parallel)
-        self.norm1 = NORM2FN[self.norm_type](self.embed_dim,
-                                             eps=config.layer_norm_eps)
-        self.norm2 = NORM2FN[self.norm_type](self.embed_dim,
-                                             eps=config.layer_norm_eps)
+        self.mlp = InternMLP(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+        self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+        self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
 
-        self.ls1 = nn.Parameter(config.initializer_factor *
-                                torch.ones(self.embed_dim))
-        self.ls2 = nn.Parameter(config.initializer_factor *
-                                torch.ones(self.embed_dim))
+        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
 
     def _init_attn(
         self,
@@ -302,35 +319,34 @@ class InternVisionEncoderLayer(nn.Module):
         use_data_parallel: bool = False,
     ):
         # fallback to sdpa attention if tp unavailable
-        tp_size = (1 if use_data_parallel else
-                   get_tensor_model_parallel_world_size())
+        tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
         num_heads = config.num_attention_heads
 
         # if the number of heads is not divisible by tp_size,
        # we also disable Attention's TP
-        use_data_parallel = (use_data_parallel
-                             or (num_heads + num_dummy_heads) % tp_size != 0)
-        return InternParallelAttention(config,
-                                       quant_config=quant_config,
-                                       num_dummy_heads=num_dummy_heads,
-                                       prefix=prefix,
-                                       use_data_parallel=use_data_parallel)
+        use_data_parallel = (
+            use_data_parallel or (num_heads + num_dummy_heads) % tp_size != 0
+        )
+        return InternParallelAttention(
+            config,
+            quant_config=quant_config,
+            num_dummy_heads=num_dummy_heads,
+            prefix=prefix,
+            use_data_parallel=use_data_parallel,
+        )
 
     def forward(
         self,
         hidden_states: torch.Tensor,
     ):
-        hidden_states = hidden_states + self.attn(
-            self.norm1(hidden_states)) * self.ls1
+        hidden_states = hidden_states + self.attn(self.norm1(hidden_states)) * self.ls1
 
-        hidden_states = hidden_states + self.mlp(
-            self.norm2(hidden_states)) * self.ls2
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) * self.ls2
 
         return hidden_states
 
 
 class InternVisionEncoder(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -350,17 +366,20 @@ class InternVisionEncoder(nn.Module):
         else:
             num_hidden_layers = num_hidden_layers_override
 
-        self.layers = nn.ModuleList([
-            InternVisionEncoderLayer(config,
-                                     quant_config,
-                                     num_dummy_heads=num_dummy_heads,
-                                     prefix=f"{prefix}.layers.{layer_idx}",
-                                     use_data_parallel=use_data_parallel)
-            for layer_idx in range(num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                InternVisionEncoderLayer(
+                    config,
+                    quant_config,
+                    num_dummy_heads=num_dummy_heads,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
 
     def forward(self, inputs_embeds: torch.Tensor):
-
         hidden_states = inputs_embeds
         for encoder_layer in self.layers:
             hidden_states = encoder_layer(hidden_states)
@@ -369,7 +388,6 @@ class InternVisionEncoder(nn.Module):
 
 
 class InternVisionModel(nn.Module):
-
     packed_modules_mapping = {
         "qkv": ["qkv"],
     }
@@ -408,8 +426,7 @@ class InternVisionModel(nn.Module):
         pixel_embeds: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
         if pixel_values is None and pixel_embeds is None:
-            raise ValueError(
-                'You have to specify pixel_values or pixel_embeds')
+            raise ValueError("You have to specify pixel_values or pixel_embeds")
 
         if pixel_embeds is not None:
             hidden_states = pixel_embeds
@@ -417,25 +434,21 @@ class InternVisionModel(nn.Module):
             if pixel_values.ndim == 4:
                 hidden_states = self.embeddings(pixel_values)
             else:
-                raise ValueError(
-                    f'wrong pixel_values size: {pixel_values.shape}')
+                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")
 
         if self.use_data_parallel:
-            encoder_outputs = run_dp_sharded_vision_model(
-                hidden_states, self.encoder)
+            encoder_outputs = run_dp_sharded_vision_model(hidden_states, self.encoder)
         else:
             encoder_outputs = self.encoder(inputs_embeds=hidden_states)
 
         return encoder_outputs
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params