[Model] Align nemotron config with final HF state and fix lm-eval-small (#7611)
@@ -35,20 +35,20 @@ class NemotronConfig(PretrainedConfig):
     Args:
-        vocab_size (`int`, *optional*, defaults to 32000):
+        vocab_size (`int`, *optional*, defaults to 256000):
             Vocabulary size of the Nemotron model. Defines the number of
             different tokens that can be represented by the
             `inputs_ids` passed when calling [`NemotronModel`]
-        hidden_size (`int`, *optional*, defaults to 4096):
+        hidden_size (`int`, *optional*, defaults to 6144):
             Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 11008):
+        intermediate_size (`int`, *optional*, defaults to 24576):
             Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
+        num_attention_heads (`int`, *optional*, defaults to 48):
             Number of attention heads for each attention layer in the
             Transformer decoder.
-        head_dim (`int`, *optional*, defaults to None):
+        head_dim (`int`, *optional*):
             Projection weights dimension in multi-head attention. Set to
             hidden_size // num_attention_heads if None
         num_key_value_heads (`int`, *optional*):
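The `head_dim` fallback documented above is easy to sanity-check against the new defaults. A minimal sketch, not part of the diff, using the values from the `+` lines:

```python
# Sketch only: head_dim left as None falls back to
# hidden_size // num_attention_heads, per the docstring above.
hidden_size = 6144
num_attention_heads = 48
head_dim = None
if head_dim is None:
    head_dim = hidden_size // num_attention_heads
print(head_dim)  # 128
```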
@@ -63,16 +63,16 @@ class NemotronConfig(PretrainedConfig):
             heads within that group. For more details checkout
             [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
             is not specified, will default to `num_attention_heads`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
             The non-linear activation function (function or string) in the
             decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
             The maximum sequence length that this model might ever be used
             with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.0134):
             The standard deviation of the truncated_normal_initializer for
             initializing all weight matrices.
-        norm_eps (`float`, *optional*, defaults to 1e-06):
+        norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the normalization layers.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values
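The new `hidden_act` default is `"relu2"`. Assuming it refers to the usual squared-ReLU activation (an assumption, not stated in this diff), it computes roughly:

```python
import torch

def relu2(x: torch.Tensor) -> torch.Tensor:
    # Assumed definition of "relu2": ReLU followed by squaring.
    return torch.relu(x) ** 2

print(relu2(torch.tensor([-1.0, 0.5, 2.0])))  # tensor([0.0000, 0.2500, 4.0000])
```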
@@ -80,21 +80,16 @@ class NemotronConfig(PretrainedConfig):
             `config.is_decoder=True`.
         pad_token_id (`int`, *optional*):
             Padding token id.
-        bos_token_id (`int`, *optional*, defaults to 1):
+        bos_token_id (`int`, *optional*, defaults to 2):
             Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 2):
+        eos_token_id (`int`, *optional*, defaults to 3):
             End of stream token id.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE
-            embeddings. Currently supports two scaling strategies: linear
-            and dynamic. Their scaling factor must be a float greater than 1.
-            The expected format is `{"type": strategy name,
-            "factor": scaling factor}`. When using this flag, don't update
-            `max_position_embeddings` to the expected new maximum.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            Percentage of the query and keys which will have rotary embedding.
         attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output
             projection layers during self-attention.
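`partial_rotary_factor` (replacing the old `rope_percent` name) controls what fraction of each attention head's dimensions receives rotary embeddings. A small sketch, not part of the diff, using the defaults documented above:

```python
# Sketch only: values taken from the defaults in this diff.
head_dim = 6144 // 48            # 128
partial_rotary_factor = 0.5
rotary_dim = int(head_dim * partial_rotary_factor)
print(rotary_dim)  # 64 dimensions per head get RoPE
```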
@@ -106,13 +101,10 @@ class NemotronConfig(PretrainedConfig):
     ```python
     >>> from transformers import NemotronModel, NemotronConfig

     >>> # Initializing a Nemotron nemotron-15b style configuration
     >>> configuration = NemotronConfig()

     >>> # Initializing a model from the nemotron-15b style configuration
     >>> model = NemotronModel(configuration)

     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
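Assuming the updated class, the aligned defaults can be spot-checked in the same doctest style as the example above (illustrative snippet, not part of the diff):

```python
>>> from transformers import NemotronConfig

>>> cfg = NemotronConfig()
>>> (cfg.vocab_size, cfg.hidden_size, cfg.num_attention_heads)
(256000, 6144, 48)
>>> (cfg.hidden_act, cfg.max_position_embeddings, cfg.norm_eps)
('relu2', 4096, 1e-05)
```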
@@ -140,7 +132,7 @@ class NemotronConfig(PretrainedConfig):
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
-        rope_percent=0.5,
+        partial_rotary_factor=0.5,
         attention_bias=False,
         attention_dropout=0.0,
         mlp_bias=False,
@@ -167,8 +159,10 @@ class NemotronConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
-        rope_percent = rope_percent or kwargs.get("rope_percentage", None)
-        self.rope_percent = rope_percent
+        # for backward compatibility
+        partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get(
+            "rope_percentage", None) or partial_rotary_factor
+        self.partial_rotary_factor = partial_rotary_factor
         self._rope_scaling_validation()
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
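The backward-compatibility branch above keeps configs that still carry the legacy `rope_percent` (or `rope_percentage`) key working. A hedged usage sketch; the import path is assumed for illustration:

```python
>>> from vllm.transformers_utils.configs import NemotronConfig  # path assumed

>>> # Legacy kwarg is remapped onto partial_rotary_factor by the shim above.
>>> NemotronConfig(rope_percent=0.25).partial_rotary_factor
0.25
>>> # Without a legacy kwarg, the new default applies.
>>> NemotronConfig().partial_rotary_factor
0.5
```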