o
    ZŽh<  ã                   @   sj   d Z ddlmZ ddlmZ ddlmZ ddlmZ e 	e
¡ZG dd„ deƒZG d	d
„ d
eƒZdd
gZdS )zUMT5 model configurationé    )ÚMappingé   )ÚPretrainedConfig)ÚOnnxSeq2SeqConfigWithPast)Úloggingc                       sd   e Zd ZdZdZdgZdddddœZ			
																			d‡ fdd„	Z‡  ZS )Ú
UMT5ConfigaR  
    This is the configuration class to store the configuration of a [`UMT5Model`]. It is used to instantiate a UMT5
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the UMT5
    [google/umt5-small](https://huggingface.co/google/umt5-small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 250112):
            Vocabulary size of the UMT5 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`UMT5Model`] or [`TFUMT5Model`].
        d_model (`int`, *optional*, defaults to 512):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
            num_heads`.
        d_ff (`int`, *optional*, defaults to 1024):
            Size of the intermediate feed forward layer in each `UMT5Block`.
        num_layers (`int`, *optional*, defaults to 8):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum distance of the longer sequences for the bucket separation.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
    Zumt5Zpast_key_valuesÚd_modelÚ	num_headsÚ
num_layersÚd_kv)Zhidden_sizeZnum_attention_headsZnum_hidden_layersZhead_dimé Ñ é   é@   é   é   Né   é    é€   çš™™™™™¹?çíµ ÷Æ°>ç      ð?ú
gated-geluTÚT5Tokenizerr   é   ç        c              	      sò   || _ || _|| _|| _|| _|d ur|n| j| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _| j d¡}|d | _|d dk| _t|ƒdkrR|d dksXt|ƒdkr`td|› dƒ‚|d	krgd
| _tƒ jd||||||dœ|¤Ž d S )Nú-éÿÿÿÿr   Zgatedr   é   z`feed_forward_proj`: z© is not a valid activation function of the dense layer. Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 'gated-gelu' or 'relu'r   Zgelu_new)Úis_encoder_decoderÚtokenizer_classÚtie_word_embeddingsÚpad_token_idÚeos_token_idÚdecoder_start_token_id© )Ú
vocab_sizer   r   Úd_ffr
   Únum_decoder_layersr	   Úrelative_attention_num_bucketsÚrelative_attention_max_distanceÚdropout_rateÚclassifier_dropoutÚlayer_norm_epsilonÚinitializer_factorÚfeed_forward_projÚ	use_cacheÚsplitZdense_act_fnZis_gated_actÚlenÚ
ValueErrorÚsuperÚ__init__)Úselfr%   r   r   r&   r
   r'   r	   r(   r)   r*   r,   r-   r.   r   r/   r   r    r!   r"   r#   r+   ÚkwargsZact_info©Ú	__class__r$   úZ/var/www/auris/lib/python3.10/site-packages/transformers/models/umt5/configuration_umt5.pyr4   R   sF   ÿ
$
ÿú
ùzUMT5Config.__init__)r   r   r   r   r   Nr   r   r   r   r   r   r   TTr   Tr   r   r   r   )	Ú__name__Ú
__module__Ú__qualname__Ú__doc__Z
model_typeZkeys_to_ignore_at_inferenceZattribute_mapr4   Ú__classcell__r$   r$   r7   r9   r      s>    -ü	êr   c                   @   sR   e Zd Zedeeeeef f fdd„ƒZedefdd„ƒZede	fdd„ƒZ
dS )	ÚUMT5OnnxConfigÚreturnc                 C   sx   dddœdddœdœ}| j r"d|d d< ddi|d	< dd
dœ|d< ndddœ|d	< dddœ|d< | j r:| j|dd |S )NÚbatchZencoder_sequence)r   r   )Z	input_idsÚattention_maskz past_encoder_sequence + sequencerB   r   r   Zdecoder_input_idsz past_decoder_sequence + sequenceZdecoder_attention_maskZdecoder_sequenceÚinputs)Ú	direction)Zuse_pastZfill_with_past_key_values_)r5   Zcommon_inputsr$   r$   r9   rC   —   s   þzUMT5OnnxConfig.inputsc                 C   ó   dS )Né   r$   ©r5   r$   r$   r9   Údefault_onnx_opset«   s   z!UMT5OnnxConfig.default_onnx_opsetc                 C   rE   )Ngü©ñÒMb@?r$   rG   r$   r$   r9   Úatol_for_validation°   s   z"UMT5OnnxConfig.atol_for_validationN)r:   r;   r<   Úpropertyr   ÚstrÚintrC   rH   ÚfloatrI   r$   r$   r$   r9   r?   –   s     r?   N)r=   Útypingr   Zconfiguration_utilsr   Zonnxr   Úutilsr   Z
get_loggerr:   Úloggerr   r?   Ú__all__r$   r$   r$   r9   Ú<module>   s   
{