
"""CLIP model configuration"""

from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional

if TYPE_CHECKING:
    from ...processing_utils import ProcessorMixin
    from ...utils import TensorType

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class CLIPTextConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the text encoder of the CLIP
[openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
        the `input_ids` passed when calling [`CLIPModel`].
    hidden_size (`int`, *optional*, defaults to 512):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    max_position_embeddings (`int`, *optional*, defaults to 77):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    pad_token_id (`int`, *optional*, defaults to 1):
        Padding token id.
    bos_token_id (`int`, *optional*, defaults to 49406):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 49407):
        End of stream token id.

Example:

```python
>>> from transformers import CLIPTextConfig, CLIPTextModel

>>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
>>> configuration = CLIPTextConfig()

>>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
>>> model = CLIPTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
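
>>> # Any documented argument can be overridden at construction time; the values below are
>>> # arbitrary illustrative choices, not a released checkpoint
>>> custom_configuration = CLIPTextConfig(hidden_size=256, num_hidden_layers=6, num_attention_heads=4)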
```"""

    model_type = "clip_text_model"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout


class CLIPVisionConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
[openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import CLIPVisionConfig, CLIPVisionModel

>>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
>>> configuration = CLIPVisionConfig()

>>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
>>> model = CLIPVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
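
>>> # The vision defaults can be overridden the same way; these values are arbitrary illustrative choices
>>> custom_configuration = CLIPVisionConfig(image_size=384, patch_size=16)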
```"""

    model_type = "clip_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class CLIPConfig(PretrainedConfig):
    r"""
[`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
a configuration with the defaults will yield a similar configuration to that of the CLIP
[openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`CLIPTextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import CLIPConfig, CLIPModel

>>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
>>> configuration = CLIPConfig()

>>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
>>> model = CLIPModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
>>> from transformers import CLIPTextConfig, CLIPVisionConfig

>>> # Initializing a CLIPText and CLIPVision configuration
>>> config_text = CLIPTextConfig()
>>> config_vision = CLIPVisionConfig()

>>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
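
>>> # The sub-configurations can also be passed as plain dicts of overrides (illustrative values)
>>> configuration = CLIPConfig(text_config={"num_hidden_layers": 6}, vision_config={"patch_size": 16})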
```"""

    model_type = "clip"
    sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig}

    def __init__(self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs):
        # The legacy `text_config_dict`/`vision_config_dict` kwargs are popped before calling `super().__init__`
        # so that they are not serialized into the saved configuration.
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # For backward compatibility, values given in `text_config_dict` take precedence over `text_config`.
        if text_config_dict is not None:
            if text_config is None:
                text_config = {}

            _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()

            # Warn when a key is set in both dicts but with different values.
            for key, value in _text_config_dict.items():
                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
                    if key in text_config_dict:
                        message = f'`{key}` is found in both `text_config_dict` and `text_config` but with different values. The value `text_config_dict["{key}"]` will be used instead.'
                    else:
                        message = f'`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["{key}"]` will be overridden.'
                    logger.info(message)

            text_config.update(_text_config_dict)

        # The same backward-compatibility handling for the vision side.
        if vision_config_dict is not None:
            if vision_config is None:
                vision_config = {}

            _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
            # Convert `id2label` keys to strings.
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {str(key): value for key, value in _vision_config_dict["id2label"].items()}

            for key, value in _vision_config_dict.items():
                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
                    if key in vision_config_dict:
                        message = f'`{key}` is found in both `vision_config_dict` and `vision_config` but with different values. The value `vision_config_dict["{key}"]` will be used instead.'
                    else:
                        message = f'`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. The value `vision_config["{key}"]` will be overridden.'
                    logger.info(message)

            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values.")

        self.text_config = CLIPTextConfig(**text_config)
        self.vision_config = CLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
configuration.

Returns:
    [`CLIPConfig`]: An instance of a configuration object
        """

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)


class CLIPOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("logits_per_image", {0: "batch"}),
                ("logits_per_text", {0: "batch"}),
                ("text_embeds", {0: "batch"}),
                ("image_embeds", {0: "batch"}),
            ]
        )

    @property
    def atol_for_validation(self) -> float:
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        # Build dummy text inputs with the tokenizer and dummy image inputs with the image processor,
        # then merge the two dictionaries.
        text_input_dict = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        image_input_dict = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        return {**text_input_dict, **image_input_dict}

    @property
    def default_onnx_opset(self) -> int:
        return 14


__all__ = ["CLIPConfig", "CLIPOnnxConfig", "CLIPTextConfig", "CLIPVisionConfig"]
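

# Illustrative usage sketch (assumes the `transformers` package with vision extras is installed):
# `CLIPOnnxConfig` can build dummy export inputs from a `CLIPProcessor`, e.g.:
#
#     from transformers import CLIPConfig, CLIPProcessor
#
#     onnx_config = CLIPOnnxConfig(CLIPConfig())
#     processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
#     dummy_inputs = onnx_config.generate_dummy_inputs(processor=processor)
#     sorted(dummy_inputs)  # typically ['attention_mask', 'input_ids', 'pixel_values']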