
    fTh:                         S r SSKJr  SSKJr  \R
                  " \5      r " S S\5      r " S S\5      r	 " S S	\5      r
/ S
Qrg)zBridgeTower model configuration   )PretrainedConfig)loggingc                   L   ^  \ rS rSrSrSrSr          SU 4S jjrSrU =r	$ )BridgeTowerVisionConfig   aC  
This is the configuration class to store the vision configuration of a [`BridgeTowerModel`]. Instantiating a
configuration with the defaults will yield a similar configuration to that of the bridgetower-base
[BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the visual encoder model.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    image_size (`int`, *optional*, defaults to 288):
        The size (resolution) of each image.
    initializer_factor (`float`, *optional*, defaults to 1):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    stop_gradient (`bool`, *optional*, defaults to `False`):
        Whether to stop gradient for training.
    share_layernorm (`bool`, *optional*, defaults to `True`):
        Whether LayerNorm layers are shared.
    remove_last_layer (`bool`, *optional*, defaults to `False`):
        Whether to remove the last layer from the vision encoder.


Example:

```python
>>> from transformers import BridgeTowerVisionConfig

>>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the vision model
>>> configuration = BridgeTowerVisionConfig()

>>> # Accessing the configuration
>>> configuration
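>>> # The documented defaults are plain attributes on the instance, e.g.:
>>> configuration.image_size
288
>>> configuration.patch_size
16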
```"""

    model_type = "bridgetower_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=768,
        num_hidden_layers=12,
        num_channels=3,
        patch_size=16,
        image_size=288,
        initializer_factor=1,
        layer_norm_eps=1e-05,
        stop_gradient=False,
        share_layernorm=True,
        remove_last_layer=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_factor = initializer_factor
        self.layer_norm_eps = layer_norm_eps
        self.stop_gradient = stop_gradient
        self.share_layernorm = share_layernorm
        self.remove_last_layer = remove_last_layer


class BridgeTowerTextConfig(PretrainedConfig):
    r"""
This is the configuration class to store the text configuration of a [`BridgeTowerModel`]. The default values here
are copied from RoBERTa. Instantiating a configuration with the defaults will yield a similar configuration to that
of the bridgetower-base [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/)
architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 50265):
        Vocabulary size of the text part of the model. Defines the number of different tokens that can be
        represented by the `input_ids` passed when calling [`BridgeTowerModel`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
    hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the attention probabilities.
    max_position_embeddings (`int`, *optional*, defaults to 514):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    type_vocab_size (`int`, *optional*, defaults to 1):
        The vocabulary size of the `token_type_ids`.
    initializer_factor (`float`, *optional*, defaults to 1):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
        Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
        positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
        [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
        For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
        with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
    is_decoder (`bool`, *optional*, defaults to `False`):
        Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.

Example:

```python
>>> from transformers import BridgeTowerTextConfig

>>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the text model
>>> configuration = BridgeTowerTextConfig()

>>> # Accessing the configuration
>>> configuration
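>>> # The RoBERTa-style defaults documented above are plain attributes, e.g.:
>>> configuration.vocab_size
50265
>>> configuration.max_position_embeddings
514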
```"""

    model_type = "bridgetower_text_model"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=50265,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        initializer_factor=1,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-05,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        position_embedding_type="absolute",
        use_cache=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_factor = initializer_factor
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id


class BridgeTowerConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`BridgeTowerModel`]. It is used to instantiate a
BridgeTower model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the bridgetower-base
[BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`):
        Whether cross modal transformer layers are shared.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler.
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    initializer_factor (`float`, *optional*, defaults to 1):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    share_link_tower_layers (`bool`, *optional*, defaults to `False`):
        Whether the bridge/link tower layers are shared.
    link_tower_type (`str`, *optional*, defaults to `"add"`):
        Type of the bridge/link layer.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 6):
        Number of hidden layers in the Transformer encoder.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether to tie input and output embeddings.
    init_layernorm_from_vision_encoder (`bool`, *optional*, defaults to `False`):
        Whether to init LayerNorm from the vision encoder.
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`BridgeTowerTextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`BridgeTowerVisionConfig`].

Example:

```python
>>> from transformers import BridgeTowerModel, BridgeTowerConfig

>>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration
>>> configuration = BridgeTowerConfig()

>>> # Initializing a model from the BridgeTower/bridgetower-base style configuration
>>> model = BridgeTowerModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
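
>>> # A config can also be composed from explicit sub-configs via the
>>> # `from_text_vision_configs` classmethod defined at the end of this class:
>>> from transformers import BridgeTowerTextConfig, BridgeTowerVisionConfig
>>> config_from_parts = BridgeTowerConfig.from_text_vision_configs(
...     BridgeTowerTextConfig(), BridgeTowerVisionConfig()
... )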
```"""

    model_type = "bridgetower"
    sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig}

    def __init__(
        self,
        share_cross_modal_transformer_layers=True,
        hidden_act="gelu",
        hidden_size=768,
        initializer_factor=1,
        layer_norm_eps=1e-05,
        share_link_tower_layers=False,
        link_tower_type="add",
        num_attention_heads=12,
        num_hidden_layers=6,
        tie_word_embeddings=False,
        init_layernorm_from_vision_encoder=False,
        text_config=None,
        vision_config=None,
        **kwargs,
    ):
        # Discard `text_config_dict`/`vision_config_dict` if passed; only
        # `text_config`/`vision_config` are used to build the nested configs.
        _ = kwargs.pop("text_config_dict", None)
        _ = kwargs.pop("vision_config_dict", None)
        super().__init__(**kwargs)
        self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers
        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        self.initializer_factor = initializer_factor
        self.layer_norm_eps = layer_norm_eps
        self.share_link_tower_layers = share_link_tower_layers
        self.link_tower_type = link_tower_type
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.tie_word_embeddings = tie_word_embeddings
        self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `BridgeTowerTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. Initializing the `BridgeTowerVisionConfig` with default values.")

        self.text_config = BridgeTowerTextConfig(**text_config)
        self.vision_config = BridgeTowerVisionConfig(**vision_config)

    @classmethod
    def from_text_vision_configs(
        cls, text_config: BridgeTowerTextConfig, vision_config: BridgeTowerVisionConfig, **kwargs
    ):
        r"""
        Instantiate a [`BridgeTowerConfig`] (or a derived class) from BridgeTower text and vision model
        configurations.

        Returns:
            [`BridgeTowerConfig`]: An instance of a configuration object
        """
        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)


__all__ = ["BridgeTowerConfig", "BridgeTowerTextConfig", "BridgeTowerVisionConfig"]