
    fThC                         S r SSKJrJrJr  SSKJr  SSKJr  \R                  " \
5      r " S S\5      r " S S	\5      r " S
 S\5      r " S S\5      r " S S\5      r/ SQrg)zFLAVA model configurations    )AnyDictOptional   )PretrainedConfig)loggingc                      ^  \ rS rSrSrSrSr               SS\S\S\S\S	\S
\S\S\S\S\S\S\S\	S\	S\4U 4S jjjr
SrU =r$ )FlavaImageConfig   a)  
This is the configuration class to store the configuration of a [`FlavaImageModel`]. It is used to instantiate an
FLAVA model according to the specified arguments, defining the model architecture.

Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
[facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` are supported.
    hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-12):
        The epsilon used by the layer normalization layers.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    qkv_bias (`bool`, *optional*, defaults to `True`):
        Whether to add a bias to the queries, keys and values.
    mask_token (`bool`, *optional*, defaults to `True`):
        Whether to use a mask token or not. Used in MIM (Masked Image Modeling) loss for FLAVA.
    vocab_size (`int`, *optional*, defaults to 8192):
        Vocabulary size of the [`FlavaImageCodebook`] used in conjunction with [`FlavaImageModel`] for MIM (Masked
        Image Modeling) loss for FLAVA.

Example:

```python
>>> from transformers import FlavaImageConfig, FlavaImageModel

>>> # Initializing a FlavaImageModel with  style configuration
>>> configuration = FlavaImageConfig()

>>> # Initializing a FlavaImageModel model (with random weights) from the style configuration
>>> model = FlavaImageModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```flava_image_modelimage_confighidden_sizenum_hidden_layersnum_attention_headsintermediate_size
hidden_acthidden_dropout_probattention_probs_dropout_probinitializer_rangelayer_norm_eps
image_size
patch_sizenum_channelsqkv_bias
mask_token
vocab_sizec                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        Xl        Xl        Xl        Xl        g N )super__init__r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                    e/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/flava/configuration_flava.pyr!   FlavaImageConfig.__init__Z   si    & 	"6"&!2#6 !2$#6 ,H)!2,$$( $$    )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )      r)      gelu        r,   {Gz?-q=      r   TT    __name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyintfloatboolr!   __static_attributes____classcell__r$   s   @r%   r
   r
      s    :x %J$O !##%!% %(.1#' %!#%#% #% !	#%
 #% #% ##% ',#% !#% #% #% #% #% #% #%  !#% #%r'   r
   c                      ^  \ rS rSrSrSrSr               SS\S\S\S\S	\S
\S\S\S\S\	S\	S\	S\	S\S\
4U 4S jjjrSrU =r$ )FlavaTextConfig   aS  
This is the configuration class to store the configuration of a [`FlavaTextModel`]. It is used to instantiate an
FLAVA model according to the specified arguments, defining the model architecture.

Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
[facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 30522):
        Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`FlavaTextModel`].
    type_vocab_size (`int`, *optional*, defaults to 2):
        The vocabulary size of the `token_type_ids` passed when calling [`FlavaTextModel`]. Note that even though
        text encoder allows `token_type_ids`'s value as 2, for text-only pretraining and fine-tuning, only 1 is
        used similar to RoBERTa.
    max_position_embeddings (`int`, *optional*, defaults to 512):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048). For VL, max_length passed to model is 77.
    position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
        Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
        positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
        [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
        For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
        with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` are supported.
    hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-12):
        The epsilon used by the layer normalization layers.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    qkv_bias (`bool`, *optional*, defaults to `True`):
        Whether to add a bias to the queries, keys and values.

Example:

```python
>>> from transformers import FlavaTextConfig, FlavaTextModel

>>> # Initializing a FlavaTextModel with  style configuration
>>> configuration = FlavaTextConfig()

>>> # Initializing a FlavaTextModel model (with random weights) from the style configuration
>>> model = FlavaTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```flava_text_modeltext_configr   type_vocab_sizemax_position_embeddingsposition_embedding_typer   r   r   r   r   r   r   r   r   pad_token_idr   c                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        Xl        Xl        Xl        Xl        g r   )r    r!   r   rE   rF   rG   r   r   r   r   r   r   r   r   r   r   rH   )r"   r   rE   rF   rG   r   r   r   r   r   r   r   r   r   rH   r   r#   r$   s                    r%   r!   FlavaTextConfig.__init__   sl    & 	"6"$.'>$'>$&!2#6 !2$#6 ,H)!2, (r'   )r   r   r   r   r   r   r   rF   r   r   rH   rG   r   rE   r   )i:w     i   absoluter(   r)   r)   r*   r+   r,   r,   r-   r.   r   T)r3   r4   r5   r6   r7   r8   r9   r:   strr;   r<   r!   r=   r>   r?   s   @r%   rA   rA      s    EN $J#O   '*'1!##%!% %(.1#' %!#)#) #) "%	#)
 "%#) #) #) !#) #) #) ##) ',#) !#) #) #)  !#) #)r'   rA   c                   ~   ^  \ rS rSrSrSrSr           SS\S\S\S\S	\S
\S\S\S\S\	S\	4U 4S jjjr
SrU =r$ )FlavaMultimodalConfig   af
  
This is the configuration class to store the configuration of a [`FlavaMultimodalModel`]. It is used to instantiate
an FLAVA model according to the specified arguments, defining the model architecture.

Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
[facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 6):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` are supported.
    hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-12):
        The epsilon used by the layer normalization layers.
    qkv_bias (`bool`, *optional*, defaults to `True`):
        Whether to add a bias to the queries, keys and values.
    use_cls_token (`bool`, *optional*, defaults to `True`):
        Whether to use an extra CLS token for multimodal settings. Usually needed by the FLAVA model.


Example:

```python
>>> from transformers import FlavaMultimodalConfig, FlavaMultimodalModel

>>> # Initializing a FlavaMultimodalModel with  style configuration
>>> configuration = FlavaMultimodalConfig()

>>> # Initializing a FlavaMultimodalModel model (with random weights) from the style configuration
>>> model = FlavaMultimodalModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```flava_multimodal_modelmultimodal_configr   r   r   r   r   r   r   r   r   r   use_cls_tokenc                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        g r   )r    r!   r   r   r   r   r   r   r   r   r   r   rS   )r"   r   r   r   r   r   r   r   r   r   r   rS   r#   r$   s                r%   r!   FlavaMultimodalConfig.__init__)  sU     	"6"&!2#6 !2$#6 ,H)!2, *r'   )r   r   r   r   r   r   r   r   r   r   rS   )r(      r)   r*   r+   r,   r,   r-   r.   TTr2   r?   s   @r%   rO   rO      s    2h *J)O !"#%!% #&,/#' %"++ + !	+
 + + !+ '*+ !+ + + + +r'   rO   c                   d   ^  \ rS rSrSrSr        SS\S\S\S\S\S	\S
\4U 4S jjjrSr	U =r
$ )FlavaImageCodebookConfigiG  flava_image_codebookimage_codebook_config
num_groupsinput_channelsnum_blocks_per_groupr   r   freezer   c                 z   > [         T	U ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        g r   )	r    r!   r[   r\   r]   r   r   r^   r   )
r"   r[   r\   r]   r   r   r^   r   r#   r$   s
            r%   r!   !FlavaImageCodebookConfig.__init__v  s<     	"6"$,$8!&$!2r'   )r^   r   r   r\   r]   r[   r   )   r   rK      r1   Tr-   )r3   r4   r5   r6   r8   r9   r:   r;   r!   r=   r>   r?   s   @r%   rX   rX   G  sx    'J-O)Z $%#'33 3 "	3
 3 3 3 !3 3r'   rX   c            )       "  ^  \ rS rSrSrSr\\\\	S.r
                    SS\\\\4      S\\\\4      S\\\\4      S\\\\4      S	\S
\S\S\S\S\S\S\S\S\S\S\S\S\S\S\4(U 4S jjjr\S\S\S\S\	4S j5       rSrU =r$ )FlavaConfigi  a  
[`FlavaConfig`] is the configuration class to store the configuration of a [`FlavaModel`]. It is used to
instantiate FLAVA model according to the specified arguments, defining the text model, image model, image codebook
and multimodal model configs. Instantiating a configuration with the defaults will yield a similar configuration to
that of the FLAVA [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`FlavaTextConfig`].
    image_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`FlavaImageConfig`].
    multimodal_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`FlavaMultimodalConfig`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    layer_norm_eps (`float`, *optional*, defaults to 1e-12):
        The epsilon used by the layer normalization layers.
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and image projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter. Default is used as per the original FLAVA/CLIP
        implementation.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    ce_ignore_index (`int`, *optional*, defaults to -100):
        Cross entropy index to ignore.
    mim_weight (`float`, *optional*, defaults to 1.0):
        Weight to be assigned to MIM (Masked Image Modeling) unimodal loss
    mlm_weight (`float`, *optional*, defaults to 1.0):
        Weight to be assigned to MLM (Masked Language Modeling) unimodal loss
    global_contrastive_weight (`float`, *optional*, defaults to 1.0):
        Weight to be assigned to global contrastive cross-alignment loss.
    itm_weight (`float`, *optional*, defaults to 1.0):
        Weight to be assigned to image-text matching multimodal loss.
    mmm_image_weight (`float`, *optional*, defaults to 1.0):
        Weight to be assigned to MMM loss's image part.
    mmm_text_weight (`float`, *optional*, defaults to 1.0):
        Weight to be assigned to MMM loss's text part.
    global_backprop_contrastive (`bool`, *optional*, defaults to `True`):
        Whether to use global backpropgation through all workers in contrastive loss.
    skip_unmasked_multimodal_encoder (`bool`, *optional*, defaults to `True`):
        Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses.
    return_loss (`bool`, *optional*, defaults to `True`):
        Whether to return loss or not

    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import FlavaConfig, FlavaModel, FlavaForPreTraining

>>> # Initializing a FlavaConfig with style configuration
>>> configuration = FlavaConfig()

>>> # Initializing a FlavaModel and FlavaForPreTraining model (with random weights) from the style configuration
>>> model = FlavaModel(configuration)
>>> model_pre = FlavaForPreTraining(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
>>> configuration_pre = model_pre.config
```
flava)rD   r   rR   rZ   r   rD   rR   rZ   r   r   projection_diminit_codebooklogit_scale_init_valuer   ce_ignore_index
mim_weight
mlm_weightglobal_contrastive_weight
itm_weightmmm_image_weightmmm_text_weightglobal_backprop_contrastive skip_unmasked_multimodal_encoderreturn_lossc                   > UR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUR                  SS 5      n[        T!U ]  " S0 UD6  Ub  Uc  0 n[        S0 UD6R	                  5       nUR                  5        HL  u  nnUU;   d  M  UUU   :w  d  M  US;  d  M!  UU;   a
  SU SU S3nOS	U S
3n[        R                  U5        MN     UR                  U5        Ub  Uc  0 n[        S0 UD6R	                  5       nSU;   a6  US   R                  5        VVs0 s H  u  nn[        U5      U_M     snnUS'   UR                  5        HL  u  nnUU;   d  M  UUU   :w  d  M  US;  d  M!  UU;   a
  SU SU S3nOSU S
3n[        R                  U5        MN     UR                  U5        Ub  Uc  0 n[        S0 UD6R	                  5       nUR                  5        HL  u  nnUU;   d  M  UUU   :w  d  M  US;  d  M!  UU;   a
  SU SU S3nOSU S
3n[        R                  U5        MN     UR                  U5        Ub  Uc  0 n[        S0 UD6R	                  5       n U R                  5        HL  u  nnUU;   d  M  UUU   :w  d  M  US;  d  M!  UU;   a
  SU SU S3nOSU S
3n[        R                  U5        MN     UR                  U 5        Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        [        S0 UD6U l        [        S0 UD6U l        [        S0 UD6U l        [        S0 UD6U l        Xpl        Xl        XPl        X`l        Xl        Xl        SU l        Xl        Xl        Xl        Xl        Xl        UU l        UU l        UU l        UU l         UU l!        g s  snnf )Ntext_config_dictimage_config_dictmultimodal_config_dictimage_codebook_config_dict)transformers_version`zp` is found in both `text_config_dict` and `text_config` but with different values. The value `text_config_dict["z"]` will be used instead.zk`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The value `text_config["z"]` will be overridden.id2labelzs` is found in both `image_config_dict` and `image_config` but with different values. The value `image_config_dict["zn`image_config_dict` is provided which will be used to initialize `FlavaImageConfig`. The value `image_config["z` is found in both `multimodal_config_dict` and `multimodal_config` but with different values. The value `multimodal_config_dict["z}`multimodal_config_dict` is provided which will be used to initialize `FlavaMultimodalConfig`. The value `multimodal_config["z` is found in both `image_codebook_config_dict` and `image_codebook_config` but with different values. The value `image_codebook_config_dict["z`image_codebook_config_dict` is provided which will be used to initialize `FlavaImageCodebookConfig`. The value `image_codebook_config["zR`image_config` is `None`. initializing the `FlavaImageConfig` with default values.zP`text_config` is `None`. Initializing the `FlavaTextConfig` with default values.z\`multimodal_config` is `None`. initializing the `FlavaMultimodalConfig` with default values.zc`image_codebook_config` is `None`. initializing the `FlavaImageCodebookConfig` with default values.      ?r   )"popr    r!   rA   to_dictitemsloggerinfoupdater
   rM   rO   rX   r   rD   rR   rZ   rf   rg   r   r   r   rh   initializer_factorri   rj   rk   rl   rm   rn   ro   rp   rq   rr   )"r"   r   rD   rR   rZ   r   r   rf   rg   rh   r   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   r#   rt   ru   rv   rw   _text_config_dictkeyvaluemessage_image_config_dict_multimodal_config_dict_image_codebook_config_dictr$   s"                                    r%   r!   FlavaConfig.__init__  s   6 "::&8$?"JJ':DA!',Dd!K%+ZZ0Ld%S""6"
 '"  !0 C2B C K K M 0557
U+%%;s3C*CSkHk..u %<<?5@Y[  336%7NP   KK( 8" 01(#! "2!F4E!F!N!N!P//6H6T6Z6Z6\26\
UCHeO6\2":.
 1668
U,&5L4E+E#UmJm//u %EEHEIbd  88;u<SU   KK( 9"  23!- ($&! '<&U>T&U&]&]&_# 6;;=
U,,!23!77#;; 44u %TTWSXXqs  VVYUZZqs   KK(% >* $$%<=%1$,(*% +C*`E_*`*h*h*j' :??A
U00!6s!;;#;; 88u %]]`\a b''  ]]`\aaxz   KK(' B, "(()DELKKlmKKKjk$ "KKvw ($&!KKu -<|<*9[9!6!K9J!K%=%V@U%V",*&,!2&<#"%.$$)B&$ 0.+F(0P-&A2s   )O3c                     U " SUR                  5       UR                  5       UR                  5       UR                  5       S.UD6$ )z
Instantiate a [`FlavaConfig`] (or a derived class) from flava text model configuration, flava image model
configuration, flava multimodal model and flava codebook model configuration.

Returns:
    [`FlavaConfig`]: An instance of a configuration object
)r   rD   rR   rZ   r   )r}   )clsr   rD   rR   rZ   r#   s         r%   from_configsFlavaConfig.from_configs  sP    "  
%--/#++-/779"7"?"?"A	

 
 	
r'   )ri   rp   rl   r   rZ   r   rg   r   r   rm   r   rh   rj   rk   rn   ro   rR   rf   rr   rq   rD   )NNNNr(   r.   r(   Tg/L
F@r-   ir{   r{   r{   r{   r{   r{   TTT)r3   r4   r5   r6   r7   r8   rA   r
   rO   rX   sub_configsr   r   rM   r   r:   r;   r<   r!   classmethodr   r=   r>   r?   s   @r%   rd   rd     s   CJ J&(2!9	K 26046::> %!"(.#'#+."%!$,015 +H'tCH~.H' d38n-H' $DcN3	H'
  (S#X7H' H' H' H' H' !&H' !H' H' H' H' $)H'  !H'"  #H'$ %H'& &*'H'( +/)H'* +H' H'T 
&
 %
 1	

  8
 
r'   rd   )rd   rX   r
   rO   rA   N)r7   typingr   r   r   configuration_utilsr   utilsr   
get_loggerr3   r   r
   rA   rO   rX   rd   __all__r   r'   r%   <module>r      s    ! & & 3  
		H	%c%' c%Ln)& n)bS+, S+lA3/ A3Ho
" o
d	 vr'   