|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" PagnolXl configuration""" |
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.utils import logging |
|
|
|
# Module-level logger, obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)
|
|
|
|
|
class PagnolXlConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PagnolXlModel`]. It is used to instantiate a
    PagnolXl model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the PagnolXl architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    The standard Transformers attribute names `hidden_size`, `num_attention_heads` and `num_hidden_layers` are
    exposed as aliases of `d_model`, `n_heads` and `n_layers` respectively (see `attribute_map`). They can be used
    both to read/write attributes and as keyword arguments of the constructor.

    Args:
        vocab_size (`int`, *optional*, defaults to 65024):
            Vocabulary size of the PagnolXl model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`PagnolXlModel`].
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Name of the activation function. The value is stored unchanged on the configuration; how it is resolved
            is up to the modeling code.
        d_model (`int`, *optional*, defaults to 4544):
            Dimension of the hidden representations. Also available as `hidden_size`.
        d_feedforward (`int`, *optional*, defaults to 18176):
            Stored as `d_feedforward`; presumably the inner dimension of the feed-forward (MLP) layers.
        n_heads (`int`, *optional*, defaults to 71):
            Number of attention heads for each attention layer. Also available as `num_attention_heads`.
        n_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder. Also available as `num_hidden_layers`.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        sigma (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions (not used by all models). Only relevant
            if `config.is_decoder=True`.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for MLP layers.
        bos_token_id (`int`, *optional*, defaults to 11):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 11):
            The id of the "end-of-sequence" token.
        n_embed (`int`, *optional*):
            Legacy keyword-only alias for `d_model`. When provided (and not `None`) it takes precedence over
            `d_model`.

    Example:

    ```python
    >>> from transformers import PagnolXlModel, PagnolXlConfig

    >>> # Initializing a small (2-layer) PagnolXl configuration
    >>> configuration = PagnolXlConfig(n_layers=2)

    >>> # Initializing a model from the small configuration
    >>> model = PagnolXlModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "pagnolxl"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Aliases from the canonical Transformers names to this model's native names. Without this map a call such as
    # `PagnolXlConfig(num_hidden_layers=2)` would silently store an unrelated attribute and leave `n_layers` at its
    # default value.
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "n_heads",
        "num_hidden_layers": "n_layers",
    }

    def __init__(
        self,
        vocab_size=65024,
        activation_function="gelu",
        d_model=4544,
        d_feedforward=18176,
        n_heads=71,
        n_layers=32,
        layer_norm_epsilon=1e-5,
        sigma=0.02,
        use_cache=True,
        dropout=0.0,
        bos_token_id=11,
        eos_token_id=11,
        **kwargs,
    ):
        self.vocab_size = vocab_size

        # `n_embed` is a legacy spelling of `d_model`; pop it so it is not forwarded to the base class as an
        # extra attribute.
        n_embed = kwargs.pop("n_embed", None)
        self.activation_function = activation_function
        self.d_model = d_model if n_embed is None else n_embed
        self.d_feedforward = d_feedforward
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.layer_norm_epsilon = layer_norm_epsilon
        self.sigma = sigma
        self.use_cache = use_cache
        self.dropout = dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # The base class assigns any remaining kwargs as attributes. Because it runs last, an aliased keyword
        # (e.g. `num_hidden_layers`) overrides the value set above through `attribute_map`.
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

    @property
    def head_dim(self):
        """Return the per-head dimension, i.e. `d_model // n_heads` (integer floor division)."""
        return self.d_model // self.n_heads
|
|