
"""mT5 model configuration"""

from typing import Mapping

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxSeq2SeqConfigWithPast
from ...utils import logging


logger = logging.get_logger(__name__)


class MT5Config(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`MT5Model`] or a [`TFMT5Model`]. It is used to
instantiate a mT5 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the mT5
[google/mt5-small](https://huggingface.co/google/mt5-small) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Arguments:
    vocab_size (`int`, *optional*, defaults to 250112):
        Vocabulary size of the mT5 model. Defines the number of different tokens that can be represented by the
        `input_ids` passed when calling [`MT5Model`] or [`TFMT5Model`].
    d_model (`int`, *optional*, defaults to 512):
        Size of the encoder layers and the pooler layer.
    d_kv (`int`, *optional*, defaults to 64):
        Size of the key, query, and value projections per attention head. Conventionally, `d_kv` is expected to
        equal `d_model // num_heads`, but in the mt5-small architecture it does not. The `inner_dim` of the
        projection layer is instead defined as `num_heads * d_kv`.
    d_ff (`int`, *optional*, defaults to 1024):
        Size of the intermediate feed forward layer in each `T5Block`.
    num_layers (`int`, *optional*, defaults to 8):
        Number of hidden layers in the Transformer encoder.
    num_decoder_layers (`int`, *optional*):
        Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
    num_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer encoder.
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum relative distance used for bucket separation of longer sequences.
    dropout_rate (`float`, *optional*, defaults to 0.1):
        The ratio for all dropout layers.
    classifier_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the classifier.
    layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
        The epsilon used by the layer normalization layers.
    initializer_factor (`float`, *optional*, defaults to 1):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
        Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
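
Example (an illustrative usage sketch, not part of the original docstring; the attribute values shown
follow from the documented defaults):

```python
>>> from transformers import MT5Config, MT5Model

>>> # Initializing a configuration with mt5-small-style defaults
>>> configuration = MT5Config()

>>> # The default `feed_forward_proj="gated-gelu"` is parsed into a gated activation
>>> configuration.is_gated_act, configuration.dense_act_fn
(True, 'gelu_new')

>>> # Initializing a randomly weighted model from the configuration
>>> model = MT5Model(configuration)
>>> configuration = model.config
```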
mt5past_key_valuesd_model	num_heads
num_layersd_kv)hidden_sizenum_attention_headsnum_hidden_layershead_dimc           
        > Xl         X l        X0l        X@l        XPl        Ub  UOU R                  U l        Xpl        Xl        Xl        Xl	        UU l
        Xl        Xl        Xl        Xl        U R                  R                  S5      nUS   U l        US   S:H  U l        [%        U5      S:  a	  US   S:w  d  [%        U5      S:  a  ['        SU S35      eUS	:X  a  S
U l        [(        TU ]T  " SUUUUUUS.UD6  g )N-r   gated      z`feed_forward_proj`: z is not a valid activation function of the dense layer. Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 'gated-gelu' or 'relu'
gated-gelugelu_new)is_encoder_decodertokenizer_classtie_word_embeddingspad_token_ideos_token_iddecoder_start_token_id )
vocab_sizer   r   d_ffr   num_decoder_layersr   relative_attention_num_bucketsrelative_attention_max_distancedropout_rateclassifier_dropoutlayer_norm_epsiloninitializer_factorfeed_forward_proj	use_cachesplitdense_act_fnis_gated_actlen
ValueErrorsuper__init__)selfr$   r   r   r%   r   r&   r   r'   r(   r)   r+   r,   r-   r   r.   r   r   r    r!   r"   r*   kwargsact_info	__class__s                           a/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mt5/configuration_mt5.pyr5   MT5Config.__init__R   s"   2 %		$"4"@doo 	 #.L+/N,("4"4"4!2"))//4$RL$QK72x=1!!73x=1;L'(9': ;) )  , *D 	
1+ 3%%#9	
 	
    )r*   r%   r   r   r0   r)   r-   r,   r1   r+   r&   r   r   r(   r'   r.   r$   )i  i   @   i      N          g?gư>g      ?r   TTT5TokenizerFr   r   r   g        )__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr5   __static_attributes____classcell__)r9   s   @r:   r	   r	      sy    +Z J#4"5 *)	M ')(+&%! -B
 B


class MT5OnnxConfig(OnnxSeq2SeqConfigWithPast):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        common_inputs = {
            "input_ids": {0: "batch", 1: "encoder_sequence"},
            "attention_mask": {0: "batch", 1: "encoder_sequence"},
        }
        if self.use_past:
            common_inputs["attention_mask"][1] = "past_encoder_sequence + sequence"
            common_inputs["decoder_input_ids"] = {0: "batch"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
        else:
            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}

        if self.use_past:
            self.fill_with_past_key_values_(common_inputs, direction="inputs")

        return common_inputs

    @property
    def default_onnx_opset(self) -> int:
        return 13

    @property
    def atol_for_validation(self) -> float:
        return 5e-4


__all__ = ["MT5Config", "MT5OnnxConfig"]
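

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream module): it shows how the
# ONNX export metadata defined above can be inspected. Constructing
# `MT5OnnxConfig` directly from a model config is an assumption based on the
# `OnnxSeq2SeqConfigWithPast` constructor; exporters normally build this object
# themselves. Because of the relative imports, run it as a module, e.g.
# `python -m transformers.models.mt5.configuration_mt5`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    config = MT5Config()
    onnx_config = MT5OnnxConfig(config)

    # Dynamic axes the exporter treats as variable-sized (batch / sequence length).
    for name, axes in onnx_config.inputs.items():
        print(name, axes)

    # Suggested opset and absolute tolerance for validating the exported graph.
    print(onnx_config.default_onnx_opset, onnx_config.atol_for_validation)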