
    fTh"                     t    S r SSKJrJrJr  SSKJr  SSKJr  \R                  " \
5      r " S S\5      rS/rg)	zProphetNet model configuration    )CallableOptionalUnion   )PretrainedConfig)loggingc            6         ^  \ rS rSrSrSrS/rSS0r                          S%S\\	   S\\
\\4      S	\\   S
\\   S\\   S\\   S\\   S\\   S\\   S\\   S\\	   S\\	   S\\   S\\	   S\\   S\\   S\\   S\\   S\\   S\\   S\\   S\\	   S\\   S\\   S\\   S\\   44U 4S  jjjr\S!\4S" j5       r\R$                  S# 5       rS$rU =r$ )&ProphetNetConfig   a  
This is the configuration class to store the configuration of a [`ProphetNetModel`]. It is used to instantiate a
ProphetNet model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the ProphetNet
[microsoft/prophetnet-large-uncased](https://huggingface.co/microsoft/prophetnet-large-uncased) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    activation_dropout (`float`, *optional*, defaults to 0.1):
        The dropout ratio for activations inside the fully connected layer.
    activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    vocab_size (`int`, *optional*, defaults to 30522):
        Vocabulary size of the ProphetNet model. Defines the number of different tokens that can be represented by
        the `input_ids` passed when calling [`ProphetNetModel`].
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the layers and the pooler layer.
    encoder_ffn_dim (`int`, *optional*, defaults to 4096):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
    num_encoder_layers (`int`, *optional*, defaults to 12):
        Number of encoder layers.
    num_encoder_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    decoder_ffn_dim (`int`, *optional*, defaults to 4096):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
    num_decoder_layers (`int`, *optional*, defaults to 12):
        Number of decoder layers.
    num_decoder_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer decoder.
    attention_dropout (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the attention probabilities.
    dropout (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    max_position_embeddings (`int`, *optional*, defaults to 512):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    init_std (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    add_cross_attention (`bool`, *optional*, defaults to `True`):
        Whether cross-attention layers should be added to the model.
    is_encoder_decoder (`bool`, *optional*, defaults to `True`):
        Whether this is an encoder/decoder model.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    bos_token_id (`int`, *optional*, defaults to 1):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 2):
        End of stream token id.
    ngram (`int`, *optional*, defaults to 2):
        Number of future tokens to predict. Set to 1 to be the same as a traditional language model, which
        predicts only the next token.
    num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer. This is for relative position calculation. See the
        [T5 paper](https://arxiv.org/abs/1910.10683) for more details.
    relative_max_distance (`int`, *optional*, defaults to 128):
        Relative distances greater than this number will all be put into the same (last) bucket. This is for
        relative position calculation. See the [T5 paper](https://arxiv.org/abs/1910.10683) for more details.
    disable_ngram_loss (`bool`, *optional*, defaults to `False`):
        Whether the model should be trained to predict only the next first token, ignoring the extra n-gram
        predictions in the loss.
    eps (`float`, *optional*, defaults to 0.0):
        Controls the `epsilon` parameter value for label smoothing in the loss calculation. If set to 0, no label
        smoothing is performed.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
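
Example (a minimal usage sketch: instantiating the default configuration and a randomly initialized
[`ProphetNetModel`] from it):

    ```python
    >>> from transformers import ProphetNetConfig, ProphetNetModel

    >>> # Initializing a ProphetNet microsoft/prophetnet-large-uncased style configuration
    >>> configuration = ProphetNetConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = ProphetNetModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # `num_hidden_layers` is derived: the sum of encoder and decoder layers
    >>> configuration.num_hidden_layers
    24
    ```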

prophetnetpast_key_valuesnum_attention_headsnum_encoder_attention_headsactivation_dropoutactivation_function
vocab_sizehidden_sizeencoder_ffn_dimnum_encoder_layersdecoder_ffn_dimnum_decoder_layersnum_decoder_attention_headsattention_dropoutdropoutmax_position_embeddingsinit_stdis_encoder_decoderadd_cross_attentiondecoder_start_token_idngramnum_bucketsrelative_max_distancedisable_ngram_losseps	use_cachepad_token_idbos_token_ideos_token_idc           
      0  > X0l         X@l        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        Xl	        X l
        UU l        UU l        UU l        UU l        UU l        Xl        Xl        Xl        UU l        [(        TU ]T  " SUUUUUUS.UD6  g )N)r&   r'   r(   r   r   r    )r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r   r   r   r%   super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   kwargs	__class__s                               o/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/prophetnet/configuration_prophetnet.pyr,   ProphetNetConfig.__init__f   s    < %&."4+F(."4+F('>$ #6  
&%:""4 "3"4" 	
%%%1 3#9	
 	
    returnc                 4    U R                   U R                  -   $ )N)r   r   )r-   s    r0   num_hidden_layers"ProphetNetConfig.num_hidden_layers   s    &&)@)@@@r2   c                     [        S5      e)NzyThis model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and `num_decoder_layers`.)NotImplementedError)r-   values     r0   r5   r6      s    !%
 	
r2   )r   r   r   r   r#   r   r   r$   r   r   r   r    r!   r   r   r   r   r"   r%   r   )皙?gelui:w  i            r<   r=   r>   r:   r:   i   g{Gz?TTr             Fg        Tr      r?   )__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr   floatr   strr   intboolr,   propertyr5   setter__static_attributes____classcell__)r/   s   @r0   r
   r
      sJ   CJ J#4"5<M /2>D$)%))-,.57)-,.57-0#&14$(-1.201 %'/2-2"$(&'&'&'7@
$UO@
 &eCM&:;@
 SM	@

 c]@
 "#@
 %SM@
 &.c]@
 "#@
 %SM@
 &.c]@
 $E?@
 %@
 "*#@
 5/@
  %TN!@
" &d^#@
$ !)%@
& }'@
( c])@
*  (}+@
, %TN-@
. e_/@
0 D>1@
2 sm3@
4 sm5@
6 sm7@
 @
D A3 A A 
 
r2   r
   N)rG   typingr   r   r   configuration_utilsr   utilsr   
get_loggerrC   loggerr
   __all__r*   r2   r0   <module>rY      sB    % , , 3  
		H	%W
' W
t 
r2   