Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
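All of the docstring edits below follow a single pattern: single-line Sphinx reST roles such as :obj:`int` and :class:`~transformers.PretrainedConfig` are rewritten to the brace-style roles {obj}`int` and {class}`~transformers.PretrainedConfig` used by the new Markdown-based API docs. A minimal sketch of the kind of rewrite involved follows; this helper is an illustration and an assumption, not part of the commit, and note that roles broken across a line break (see the untouched `:class:` / `~transformers.EXAONEModel` pair in the Exaone hunks) would not match its single-line pattern.

    # Hypothetical migration helper (an assumption, not part of this commit):
    # rewrite single-line Sphinx roles like :obj:`int` into brace style.
    import re
    from pathlib import Path

    ROLE = re.compile(r":(obj|class|meth|func|attr):(`[^`]+`)")

    def to_brace_style(text: str) -> str:
        # ":obj:`int`" -> "{obj}`int`"; roles split across lines won't match.
        return ROLE.sub(r"{\1}\2", text)

    for py in Path("vllm").rglob("*.py"):
        src = py.read_text()
        dst = to_brace_style(src)
        if dst != src:
            py.write_text(dst)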
@@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig):
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabling this will also
-            allow the model to output the auxiliary loss. See [here]() for more details
+            Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
             The aux loss factor for the total loss.
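For context on the two Dbrx options in this hunk, a minimal usage sketch follows; it assumes a transformers-compatible DbrxConfig is importable (an assumption, since this diff only shows the docstring):

    # Hedged sketch: enable router-logit output so the MoE auxiliary
    # load-balancing loss can be added to the total loss during training.
    from transformers import DbrxConfig  # import path is an assumption

    config = DbrxConfig(
        output_router_logits=True,   # return router logits and the aux loss
        router_aux_loss_coef=0.001,  # aux-loss weight (docstring default)
    )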
@@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig):
     Instantiating a configuration with the defaults will yield a similar
     configuration to that of the Exaone

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
+    Configuration objects inherit from {class}`~transformers.PretrainedConfig`
     and can be used to control the model outputs. Read the documentation from :
     class:`~transformers.PretrainedConfig` for more information.

     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50257):
+        vocab_size ({obj}`int`, `optional`, defaults to 50257):
             Vocabulary size of the GPT Lingvo model. Defines the number of
-            different tokens that can be represented by the :obj:`inputs_ids`
-            passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
+            different tokens that can be represented by the {obj}`inputs_ids`
+            passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
             size of the model.
             Defines the different tokens that can be represented by the
             `inputs_ids` passed to the forward method of :class:
             `~transformers.EXAONEModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+        hidden_size ({obj}`int`, `optional`, defaults to 2048):
             Dimensionality of the encoder layers and the pooler layer.
-        num_layers (:obj:`int`, `optional`, defaults to 24):
+        num_layers ({obj}`int`, `optional`, defaults to 24):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 32):
             Number of attention heads for each attention layer in the
@@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig):
         specified, will default to `num_attention_heads`.
         rotary_pct (`float`, *optional*, defaults to 0.25):
             percentage of hidden dimensions to allocate to rotary embeddings
-        intermediate_size (:obj:`int`, `optional`, defaults to 8192):
+        intermediate_size ({obj}`int`, `optional`, defaults to 8192):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in
             the Transformer encoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`,
-        defaults to :obj:`"gelu_new"`):
+        activation_function ({obj}`str` or {obj}`function`, `optional`,
+        defaults to {obj}`"gelu_new"`):
             The non-linear activation function (function or string) in the
-            encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
-            :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
+            {obj}`"selu"` and {obj}`"gelu_new"` are supported.
+        embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the
             embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+        max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case
             (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling
-            :class:`~transformers.EXAONEModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size ({obj}`int`, `optional`, defaults to 2):
+            The vocabulary size of the {obj}`token_type_ids` passed when calling
+            {class}`~transformers.EXAONEModel`.
+        initializer_range ({obj}`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for
             initializing all weight matrices.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
             Whether or not the model should return the last key/values
             attentions (not used by all models).
             Only relevant if ``config.is_decoder=True``.
-        gradient_checkpointing (:obj:`bool`, `optional`,
-        defaults to :obj:`False`):
+        gradient_checkpointing ({obj}`bool`, `optional`,
+        defaults to {obj}`False`):
             If True, use gradient checkpointing to save memory at the expense
             of slower backward pass.
     Example::
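The Example:: section opened above would show basic instantiation. A minimal sketch under the assumption that the vendored class is exported as vllm.transformers_utils.configs.ExaoneConfig (the import path and exported name are assumptions):

    # Hedged sketch of the usage the Example:: block documents.
    from vllm.transformers_utils.configs import ExaoneConfig  # path assumed

    # Values mirror the docstring defaults above; override per model variant.
    config = ExaoneConfig(vocab_size=50257, hidden_size=2048, num_layers=24)
    print(config.use_cache)  # True by default, per the docstring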