
"""Siglip model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class SiglipTextConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 32000):
        Vocabulary size of the Siglip text model. Defines the number of different tokens that can be represented by
        the `input_ids` passed when calling [`SiglipModel`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    max_position_embeddings (`int`, *optional*, defaults to 64):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    pad_token_id (`int`, *optional*, defaults to 1):
        The id of the padding token in the vocabulary.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the vocabulary.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the vocabulary.
    projection_size (`int`, *optional*, defaults to `hidden_size`):
        The size of the projection head.

Example:

```python
>>> from transformers import SiglipTextConfig, SiglipTextModel

>>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipTextConfig()

>>> # Initializing a SiglipTextModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
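>>> # Illustrative sketch (not part of the original example): any of the documented
>>> # arguments above can be overridden at construction time; the values here are made up
>>> custom_configuration = SiglipTextConfig(hidden_size=512, num_hidden_layers=6, projection_size=256)
>>> custom_configuration.projection_size
256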
```"""

    model_type = "siglip_text_model"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        max_position_embeddings=64,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        projection_size=None,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.attention_dropout = attention_dropout
        self.projection_size = projection_size if projection_size is not None else hidden_size


class SiglipVisionConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.

Example:

```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel

>>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipVisionConfig()

>>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
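>>> # Illustrative sketch (not part of the original example): the resolution-related
>>> # arguments documented above can be overridden; 384/16 here are made-up values
>>> custom_configuration = SiglipVisionConfig(image_size=384, patch_size=16)
>>> custom_configuration.image_size
384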
```"""

    model_type = "siglip_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class SiglipConfig(PretrainedConfig):
    r"""
[`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
instantiate a Siglip model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`SiglipTextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import SiglipConfig, SiglipModel

>>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipConfig()

>>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
>>> from transformers import SiglipTextConfig, SiglipVisionConfig

>>> # Initializing a SiglipText and SiglipVision configuration
>>> config_text = SiglipTextConfig()
>>> config_vision = SiglipVisionConfig()

>>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
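>>> # Illustrative sketch (not part of the original example): plain dicts of keyword
>>> # arguments are also accepted for the sub-configs; the overrides below are made up
>>> config = SiglipConfig(text_config={"max_position_embeddings": 128}, vision_config={"image_size": 384})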
```"""

    model_type = "siglip"
    sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig}

    def __init__(self, text_config=None, vision_config=None, **kwargs):
        super().__init__(**kwargs)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")

        self.text_config = SiglipTextConfig(**text_config)
        self.vision_config = SiglipVisionConfig(**vision_config)

        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
        r"""
        Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
        model configuration.

        Returns:
            [`SiglipConfig`]: An instance of a configuration object
        """
        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)


__all__ = ["SiglipConfig", "SiglipTextConfig", "SiglipVisionConfig"]