
"""Idefics3 model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class Idefics3VisionConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of an [`Idefics3VisionModel`]. It is used to instantiate an
Idefics3 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics3 model
[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1152):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

Example:

```python
>>> from transformers.models.idefics3.modeling_idefics3 import Idefics3VisionTransformer
>>> from transformers.models.idefics3.configuration_idefics3 import Idefics3VisionConfig

>>> # Initializing an Idefics3VisionConfig with a google/siglip-base-patch16-224 style configuration
>>> configuration = Idefics3VisionConfig()

>>> # Initializing an Idefics3VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = Idefics3VisionTransformer(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
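
>>> # A minimal variation (sketch, with illustrative values): overriding a few encoder fields
>>> # from the Args list above; any field left unset keeps its documented default.
>>> small_configuration = Idefics3VisionConfig(hidden_size=384, num_hidden_layers=6, num_attention_heads=6)
>>> small_model = Idefics3VisionTransformer(small_configuration)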
```"""

    model_type = "idefics3_vision"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1152,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=16,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range


class Idefics3Config(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of an [`Idefics3Model`]. It is used to instantiate an
Idefics3 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Idefics3
[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should cache the key/value pairs of the attention mechanism. Only
        relevant if `config.is_decoder=True`.
    image_token_id (`int`, *optional*, defaults to 128257):
        The id of the "image" token.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether or not to tie the word embeddings with the token embeddings.
    vision_config (`Idefics3VisionConfig` or `dict`, *optional*, defaults to `Idefics3VisionConfig`):
        Custom vision config or dict for the vision tower.
    text_config (`PretrainedConfig` or `dict`, *optional*, defaults to `LlamaConfig`):
        Custom text config or dict for the text model.
    scale_factor (`int`, *optional*, defaults to 2):
        The scale factor for the image encoder.
    pad_token_id (`int`, *optional*, defaults to 128002):
        The id of the padding token.

Example:
```python
>>> from transformers import Idefics3Model, Idefics3Config
>>> # Initializing configuration
>>> configuration = Idefics3Config()
>>> # Initializing a model from the configuration
>>> model = Idefics3Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
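>>> # A minimal sketch of passing the sub-configs as plain dicts (illustrative values): a
>>> # `vision_config` dict is expanded into an `Idefics3VisionConfig`, and a `text_config`
>>> # dict without a "model_type" key is treated as a Llama config; unset fields keep their defaults.
>>> custom_configuration = Idefics3Config(
...     vision_config={"image_size": 364, "patch_size": 14},
...     text_config={"hidden_size": 2048, "num_attention_heads": 16},
...     scale_factor=2,
... )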
```"""

    model_type = "idefics3"
    sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig}

    def __init__(
        self,
        use_cache=True,
        image_token_id=128257,
        tie_word_embeddings=False,
        vision_config=None,
        text_config=None,
        scale_factor=2,
        pad_token_id=128_002,
        **kwargs,
    ):
        self.image_token_id = image_token_id
        self.use_cache = use_cache
        self.tie_word_embeddings = tie_word_embeddings

        if vision_config is None:
            self.vision_config = Idefics3VisionConfig()
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = Idefics3VisionConfig(**vision_config)
        elif isinstance(vision_config, Idefics3VisionConfig):
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            logger.info("text_config is None, using default text config")
            text_config = CONFIG_MAPPING["llama"](
                rms_norm_eps=1e-5,
                pad_token_id=pad_token_id,
                tie_word_embeddings=False,
            )

        self.text_config = text_config
        self.scale_factor = scale_factor

        super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings)


__all__ = ["Idefics3Config", "Idefics3VisionConfig"]