Remove `padding_idx` from models that don't use it for better Transformers v5 compatibility (#35189)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2026-02-24 16:04:46 +00:00
committed by GitHub
parent 60da0e1544
commit c38b8d5a31
14 changed files with 0 additions and 14 deletions

View File

@@ -421,7 +421,6 @@ class Ernie4_5_MoeModel(nn.Module):
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.config = config
parallel_config = vllm_config.parallel_config

View File

@@ -523,7 +523,6 @@ class Ernie4_5_VLMoeModel(nn.Module):
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.config = config

View File

@@ -157,7 +157,6 @@ class GraniteMoeSharedModel(nn.Module):
self.config = config
self.quant_config = quant_config # Required by MixtralModel
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size

View File

@@ -451,7 +451,6 @@ class Grok1Model(nn.Module):
self.config = config
self.quant_config = quant_config
self.padding_idx = config.pad_token_id
# Store expert naming for weight loading
self.ckpt_gate_proj_name = ckpt_gate_proj_name

View File

@@ -600,7 +600,6 @@ class HunYuanModel(nn.Module):
self.config = config
self.quant_config = quant_config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size

View File

@@ -305,7 +305,6 @@ class Jais2Model(nn.Module):
self.config = config
self.quant_config = quant_config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.org_vocab_size = config.vocab_size

View File

@@ -393,7 +393,6 @@ class KimiLinearModel(nn.Module):
parallel_config = vllm_config.parallel_config
self.config = config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
if get_pp_group().is_first_rank:

View File

@@ -486,7 +486,6 @@ class FlashModel(nn.Module):
quant_config = vllm_config.quant_config
self.config = config
self.padding_idx = getattr(config, "pad_token_id", None)
self.vocab_size = config.vocab_size
if get_pp_group().is_first_rank:

View File

@@ -495,7 +495,6 @@ class MiniMaxText01Model(nn.Module):
cache_config = vllm_config.cache_config
scheduler_config = vllm_config.scheduler_config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.decoder_attention_types = getattr(

View File

@@ -241,7 +241,6 @@ class DeciModel(nn.Module):
self.config = config
self.quant_config = quant_config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size

View File

@@ -1029,7 +1029,6 @@ class OpenPanguModel(nn.Module):
self.config = config
self.num_redundant_experts = eplb_config.num_redundant_experts
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
if get_pp_group().is_first_rank or (

View File

@@ -748,7 +748,6 @@ class Plamo2Model(torch.nn.Module):
config = vllm_config.model_config.hf_config
self.config = config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = VocabParallelEmbedding(

View File

@@ -317,7 +317,6 @@ class Plamo3Model(nn.Module):
config = vllm_config.model_config.hf_config
self.config = config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.org_vocab_size = config.vocab_size

View File

@@ -443,7 +443,6 @@ class Qwen3MoeModel(nn.Module):
eplb_config = parallel_config.eplb_config
self.num_redundant_experts = eplb_config.num_redundant_experts
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.config = config
self.quant_config = quant_config