
"""Idefics2 model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class Idefics2VisionConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`Idefics2VisionModel`]. It is used to instantiate an
Idefics2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a configuration similar to that of the SigLIP checkpoint
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics2 model
[HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation for initializing all weight matrices in the model.

Example:

```python
>>> from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
>>> from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig

>>> # Initializing an Idefics2VisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = Idefics2VisionConfig()

>>> # Initializing an Idefics2VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = Idefics2VisionTransformer(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
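
>>> # Defaults can be overridden at construction, e.g. for a finer patch grid
>>> # (illustrative values, not a released checkpoint's settings)
>>> configuration = Idefics2VisionConfig(image_size=448, patch_size=14)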
```"""

    model_type = "idefics2_vision"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range


class Idefics2PerceiverConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of the Idefics2 perceiver resampler, which compresses
the image hidden states into a fixed number of latent embeddings.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the perceiver block.
    hidden_size (`int`, *optional*, defaults to 4096):
        Dimension of the hidden representations.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    resampler_n_latents (`int`, *optional*, defaults to 64):
        Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
    resampler_depth (`int`, *optional*, defaults to 3):
        Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
    resampler_n_heads (`int`, *optional*, defaults to 16):
        Number of heads in each Transformer block (for multi-headed self-attention).
    resampler_head_dim (`int`, *optional*, defaults to 96):
        Dimensionality of each head projection in the Transformer block.
    num_key_value_heads (`int`, *optional*, defaults to 4):
        Number of key-value heads in the perceiver attention block.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation for initializing all weight matrices in the model.
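
Example (an illustrative sketch built from the documented defaults above, not a released checkpoint config):

```python
>>> from transformers.models.idefics2.configuration_idefics2 import Idefics2PerceiverConfig

>>> # Initializing an Idefics2PerceiverConfig that resamples each image to 64 latent embeddings
>>> configuration = Idefics2PerceiverConfig(resampler_n_latents=64, resampler_depth=3)

>>> # Accessing a default value
>>> configuration.resampler_n_heads
16
```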
"""

    model_type = "idefics2_perceiver"

    def __init__(
        self,
        hidden_act="silu",
        hidden_size=4096,
        rms_norm_eps=1e-06,
        resampler_n_latents=64,
        resampler_depth=3,
        resampler_n_heads=16,
        resampler_head_dim=96,
        num_key_value_heads=4,
        attention_dropout=0.0,
        initializer_range=0.02,
        **kwargs,
    ):
        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        self.rms_norm_eps = rms_norm_eps
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.resampler_n_heads = resampler_n_heads
        self.num_key_value_heads = num_key_value_heads
        self.resampler_head_dim = resampler_head_dim
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range

        # Key/value heads are shared across the attention heads, so they can never outnumber them.
        if self.num_key_value_heads > self.resampler_n_heads:
            raise ValueError(
                f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
                f" resampler_n_heads={self.resampler_n_heads}"
            )
        super().__init__(**kwargs)


class Idefics2Config(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`Idefics2Model`]. It is used to instantiate an
Idefics2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a configuration similar to that of the Idefics2
[HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should cache the key/value pairs of the attention mechanism.
    image_token_id (`int`, *optional*, defaults to 32001):
        The id of the "image" token.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether or not to tie the word embeddings with the token embeddings.
    vision_config (`Idefics2VisionConfig` or `dict`, *optional*):
        Custom vision config or dict.
    perceiver_config (`Idefics2PerceiverConfig` or `dict`, *optional*):
        Custom perceiver config or dict.
    text_config (`MistralConfig` or `dict`, *optional*):
        Custom text config or dict for the text model.

Example:
```python
>>> from transformers import Idefics2Model, Idefics2Config
>>> # Initializing configuration
>>> configuration = Idefics2Config()
>>> # Initializing a model from the configuration
>>> model = Idefics2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
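
>>> # Sub-configs may also be passed as plain dicts whose keys mirror the documented
>>> # arguments (values here are illustrative)
>>> configuration = Idefics2Config(
...     vision_config={"hidden_size": 768, "patch_size": 32},
...     perceiver_config={"resampler_n_latents": 64},
... )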
```"""

    model_type = "idefics2"
    sub_configs = {
        "text_config": AutoConfig,
        "perceiver_config": Idefics2PerceiverConfig,
        "vision_config": Idefics2VisionConfig,
    }

    def __init__(
        self,
        use_cache=True,
        image_token_id=32_001,
        tie_word_embeddings=False,
        vision_config=None,
        perceiver_config=None,
        text_config=None,
        **kwargs,
    ):
        self.image_token_id = image_token_id
        self.use_cache = use_cache
        self.tie_word_embeddings = tie_word_embeddings

        # Fall back to default sub-configs when none are provided, and promote dicts
        # to their config classes.
        if perceiver_config is None:
            self.perceiver_config = Idefics2PerceiverConfig()
            logger.info("perceiver_config is None, using default perceiver config")
        elif isinstance(perceiver_config, dict):
            self.perceiver_config = Idefics2PerceiverConfig(**perceiver_config)
        elif isinstance(perceiver_config, Idefics2PerceiverConfig):
            self.perceiver_config = perceiver_config

        if vision_config is None:
            self.vision_config = Idefics2VisionConfig()
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = Idefics2VisionConfig(**vision_config)
        elif isinstance(vision_config, Idefics2VisionConfig):
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "mistral"
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            logger.info("text_config is None, using default text config")
            text_config = CONFIG_MAPPING["mistral"](
                max_position_embeddings=4096 * 8,
                rms_norm_eps=1e-5,
                # None in the original configs
                pad_token_id=0,
                tie_word_embeddings=False,
            )

        self.text_config = text_config

        # Keep the perceiver aligned with the text model when a checkpoint omitted these keys.
        if self.text_config.hidden_size != self.perceiver_config.hidden_size:
            self.perceiver_config.hidden_size = self.text_config.hidden_size
            self.perceiver_config.rms_norm_eps = self.text_config.rms_norm_eps
            logger.warning_once(
                "Perceiver config has a different `hidden_size` than text config, which means default values were"
                " used. In your model's config on the hub, add `hidden_size` and `rms_norm_eps` keys under the"
                " `perceiver_config` dict."
            )

        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)


__all__ = ["Idefics2Config", "Idefics2PerceiverConfig", "Idefics2VisionConfig"]