[Model] Align nemotron config with final HF state and fix lm-eval-small (#7611)
@@ -35,20 +35,20 @@ class NemotronConfig(PretrainedConfig):
     Args:
-        vocab_size (`int`, *optional*, defaults to 32000):
+        vocab_size (`int`, *optional*, defaults to 256000):
             Vocabulary size of the Nemotron model. Defines the number of
             different tokens that can be represented by the
             `inputs_ids` passed when calling [`NemotronModel`]
-        hidden_size (`int`, *optional*, defaults to 4096):
+        hidden_size (`int`, *optional*, defaults to 6144):
             Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 11008):
+        intermediate_size (`int`, *optional*, defaults to 24576):
             Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
+        num_attention_heads (`int`, *optional*, defaults to 48):
             Number of attention heads for each attention layer in the
             Transformer decoder.
-        head_dim (`int`, *optional*, defaults to None):
+        head_dim (`int`, *optional*):
             Projection weights dimension in multi-head attention. Set to
             hidden_size // num_attention_heads if None
         num_key_value_heads (`int`, *optional*):
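The `head_dim` fallback documented above is easy to sanity-check against the new defaults. A minimal sketch, not part of the diff, using the values from the `+` lines:

```python
# Sketch only: head_dim left as None falls back to
# hidden_size // num_attention_heads, per the docstring above.
hidden_size = 6144
num_attention_heads = 48
head_dim = None
if head_dim is None:
    head_dim = hidden_size // num_attention_heads
print(head_dim)  # 128
```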
@@ -63,16 +63,16 @@ class NemotronConfig(PretrainedConfig):
             heads within that group. For more details checkout
             [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
             is not specified, will default to `num_attention_heads`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
             The non-linear activation function (function or string) in the
             decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
             The maximum sequence length that this model might ever be used
             with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.0134):
             The standard deviation of the truncated_normal_initializer for
             initializing all weight matrices.
-        norm_eps (`float`, *optional*, defaults to 1e-06):
+        norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the normalization layers.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values
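The new `hidden_act` default is `"relu2"`. Assuming it refers to the usual squared-ReLU activation (an assumption, not stated in this diff), it computes roughly:

```python
import torch

def relu2(x: torch.Tensor) -> torch.Tensor:
    # Assumed definition of "relu2": ReLU followed by squaring.
    return torch.relu(x) ** 2

print(relu2(torch.tensor([-1.0, 0.5, 2.0])))  # tensor([0.0000, 0.2500, 4.0000])
```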
@@ -80,21 +80,16 @@ class NemotronConfig(PretrainedConfig):
             `config.is_decoder=True`.
         pad_token_id (`int`, *optional*):
             Padding token id.
-        bos_token_id (`int`, *optional*, defaults to 1):
+        bos_token_id (`int`, *optional*, defaults to 2):
             Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 2):
+        eos_token_id (`int`, *optional*, defaults to 3):
             End of stream token id.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE
-            embeddings. Currently supports two scaling strategies: linear
-            and dynamic. Their scaling factor must be a float greater than 1.
-            The expected format is `{"type": strategy name,
-            "factor": scaling factor}`. When using this flag, don't update
-            `max_position_embeddings` to the expected new maximum.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            Percentage of the query and keys which will have rotary embedding.
         attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output
             projection layers during self-attention.
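`partial_rotary_factor` (replacing the old `rope_percent` name) controls what fraction of each attention head's dimensions receives rotary embeddings. A small sketch, not part of the diff, using the defaults documented above:

```python
# Sketch only: values taken from the defaults in this diff.
head_dim = 6144 // 48            # 128
partial_rotary_factor = 0.5
rotary_dim = int(head_dim * partial_rotary_factor)
print(rotary_dim)  # 64 dimensions per head get RoPE
```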
@@ -106,13 +101,10 @@ class NemotronConfig(PretrainedConfig):
     ```python
     >>> from transformers import NemotronModel, NemotronConfig

     >>> # Initializing a Nemotron nemotron-15b style configuration
     >>> configuration = NemotronConfig()

     >>> # Initializing a model from the nemotron-15b style configuration
     >>> model = NemotronModel(configuration)

     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
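Assuming the updated class, the aligned defaults can be spot-checked in the same doctest style as the example above (illustrative snippet, not part of the diff):

```python
>>> from transformers import NemotronConfig

>>> cfg = NemotronConfig()
>>> (cfg.vocab_size, cfg.hidden_size, cfg.num_attention_heads)
(256000, 6144, 48)
>>> (cfg.hidden_act, cfg.max_position_embeddings, cfg.norm_eps)
('relu2', 4096, 1e-05)
```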
@@ -140,7 +132,7 @@ class NemotronConfig(PretrainedConfig):
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
-        rope_percent=0.5,
+        partial_rotary_factor=0.5,
         attention_bias=False,
         attention_dropout=0.0,
         mlp_bias=False,
@@ -167,8 +159,10 @@ class NemotronConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
-        rope_percent = rope_percent or kwargs.get("rope_percentage", None)
-        self.rope_percent = rope_percent
+        # for backward compatibility
+        partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get(
+            "rope_percentage", None) or partial_rotary_factor
+        self.partial_rotary_factor = partial_rotary_factor
         self._rope_scaling_validation()
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
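The backward-compatibility branch above keeps configs that still carry the legacy `rope_percent` (or `rope_percentage`) key working. A hedged usage sketch; the import path is assumed for illustration:

```python
>>> from vllm.transformers_utils.configs import NemotronConfig  # path assumed

>>> # Legacy kwarg is remapped onto partial_rotary_factor by the shim above.
>>> NemotronConfig(rope_percent=0.25).partial_rotary_factor
0.25
>>> # Without a legacy kwarg, the new default applies.
>>> NemotronConfig().partial_rotary_factor
0.5
```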