
    fThN8                         S r SSKJr  SSKJrJrJrJrJr  \(       a  SSK	J
r
  SSKJr  SSKJr  SSKJr  SS	KJr  \R$                  " \5      r " S
 S\5      r " S S\5      r " S S\5      r " S S\5      r/ SQrg)zOWL-ViT model configuration    OrderedDict)TYPE_CHECKINGAnyDictMappingOptional   )ProcessorMixin)
TensorType)PretrainedConfig)
OnnxConfig)loggingc                   T   ^  \ rS rSrSrSrSr              SU 4S jjrSrU =r	$ )OwlViTTextConfig!   a  
This is the configuration class to store the configuration of an [`OwlViTTextModel`]. It is used to instantiate an
OwlViT text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OwlViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the OWL-ViT text model. Defines the number of different tokens that can be represented
        by the `inputs_ids` passed when calling [`OwlViTTextModel`].
    hidden_size (`int`, *optional*, defaults to 512):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    max_position_embeddings (`int`, *optional*, defaults to 16):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    pad_token_id (`int`, *optional*, defaults to 0):
        The id of the padding token in the input sequences.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the input sequences.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the input sequences.

Example:

```python
>>> from transformers import OwlViTTextConfig, OwlViTTextModel

>>> # Initializing a OwlViTTextModel with google/owlvit-base-patch32 style configuration
>>> configuration = OwlViTTextConfig()

>>> # Initializing a OwlViTTextConfig from the google/owlvit-base-patch32 style configuration
>>> model = OwlViTTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```owlvit_text_modeltext_configc                    > [         TU ]  " SXUS.UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        g )N)pad_token_idbos_token_ideos_token_id )super__init__
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsmax_position_embeddings
hidden_actlayer_norm_epsattention_dropoutinitializer_rangeinitializer_factor)selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r   r   r   kwargs	__class__s                   g/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/owlvit/configuration_owlvit.pyr   OwlViTTextConfig.__init__`   s^    $ 	sl\hslrs$&!2!2#6 '>$$,!2!2"4    )r$   r"   r   r&   r%   r   r#   r!   r    r   r   )i      i            
quick_geluh㈵>        {Gz?      ?r   i  i  
__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyr   __static_attributes____classcell__r)   s   @r*   r   r   !   sK    9v %J#O  "5 5r,   r   c                   P   ^  \ rS rSrSrSrSr            SU 4S jjrSrU =r	$ )OwlViTVisionConfig   a
  
This is the configuration class to store the configuration of an [`OwlViTVisionModel`]. It is used to instantiate
an OWL-ViT image encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OWL-ViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 768):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import OwlViTVisionConfig, OwlViTVisionModel

>>> # Initializing a OwlViTVisionModel with google/owlvit-base-patch32 style configuration
>>> configuration = OwlViTVisionConfig()

>>> # Initializing a OwlViTVisionModel model from the google/owlvit-base-patch32 style configuration
>>> model = OwlViTVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```owlvit_vision_modelvision_configc                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        Xl        g )Nr   )r   r   r   r   r   r    num_channels
image_size
patch_sizer"   r#   r$   r%   r&   )r'   r   r   r   r    rG   rH   rI   r"   r#   r$   r%   r&   r(   r)   s                 r*   r   OwlViTVisionConfig.__init__   sZ      	"6"&!2!2#6 ($$$,!2!2"4r,   )r$   r"   r   rH   r&   r%   r   r#   r    rG   r   rI   )   i   r.   r.   r
   rK       r1   r2   r3   r4   r5   r6   r@   s   @r*   rB   rB      sE    2h 'J%O 5 5r,   rB   c                   d   ^  \ rS rSrSrSr\\S.r     S
U 4S jjr	\
S\S\4S j5       rS	rU =r$ )OwlViTConfig   ai  
[`OwlViTConfig`] is the configuration class to store the configuration of an [`OwlViTModel`]. It is used to
instantiate an OWL-ViT model according to the specified arguments, defining the text model and vision model
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the OWL-ViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`OwlViTTextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`OwlViTVisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter. Default is used as per the original OWL-ViT
        implementation.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return a dictionary. If `False`, returns a tuple.
    kwargs (*optional*):
        Dictionary of keyword arguments.
owlvit)r   rE   c                    > [         TU ]  " S0 UD6  Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        [	        S0 UD6U l        [        S0 UD6U l        X0l        X@l	        XPl
        SU l        g )NzKtext_config is None. Initializing the OwlViTTextConfig with default values.zOvision_config is None. initializing the OwlViTVisionConfig with default values.r5   r   )r   r   loggerinfor   r   rB   rE   projection_dimlogit_scale_init_valuereturn_dictr&   )r'   r   rE   rT   rU   rV   r(   r)   s          r*   r   OwlViTConfig.__init__   s     	"6"KKKef MKKij+:k:/@-@,&<#&"%r,   r   rE   c                 <    0 nXS'   X$S'   U R                   " U40 UD6$ )z
Instantiate a [`OwlViTConfig`] (or a derived class) from owlvit text model configuration and owlvit vision
model configuration.

Returns:
    [`OwlViTConfig`]: An instance of a configuration object
r   rE   )	from_dict)clsr   rE   r(   config_dicts        r*   from_text_vision_configs%OwlViTConfig.from_text_vision_configs  s-     %0M"'4O$}}[3F33r,   )r&   rU   rT   rV   r   rE   )NNr-   g/L
F@T)r7   r8   r9   r:   r;   r<   r   rB   sub_configsr   classmethodr   r\   r>   r?   r@   s   @r*   rN   rN      sR    2 J"2EWXK %&6 44 4 4 4r,   rN   c                      ^  \ rS rSr\S\\\\\4   4   4S j5       r\S\\\\\4   4   4S j5       r	\S\
4S j5       r   SSSS\S	\S
\S   S\\\4   4
U 4S jjjr\S\4S j5       rSrU =r$ )OwlViTOnnxConfigi!  returnc           	      @    [        SSSS.4SSSSSS	.4S
SSS.4/5      $ )N	input_idsbatchsequence)r      pixel_valuesrG   heightwidth)r   rg      r
   attention_maskr   r'   s    r*   inputsOwlViTOnnxConfig.inputs"  s@    'j9:WHQX!YZ!w:#>?
 	
r,   c                 @    [        SSS04SSS04SSS04SSS04/5      $ )Nlogits_per_imager   re   logits_per_texttext_embedsimage_embedsr   rm   s    r*   outputsOwlViTOnnxConfig.outputs,  sD    #a\2"QL1G-!W.	
 	
r,   c                     g)Ng-C6?r   rm   s    r*   atol_for_validation$OwlViTOnnxConfig.atol_for_validation7  s    r,   	processorr   
batch_size
seq_length	frameworkr   c                 r   > [         TU ]  UR                  X#US9n[         TU ]  UR                  X$S9n0 UEUE$ )N)r{   r|   r}   )r{   r}   )r   generate_dummy_inputs	tokenizerimage_processor)r'   rz   r{   r|   r}   text_input_dictimage_input_dictr)   s          r*   r   &OwlViTOnnxConfig.generate_dummy_inputs;  s\      '7JYb 8 
 !78%%* 9 
 7/6%566r,   c                     g)N   r   rm   s    r*   default_onnx_opset#OwlViTOnnxConfig.default_onnx_opsetJ  s    r,   r   )r   N)r7   r8   r9   r:   propertyr   strintrn   ru   floatrx   r	   r   r   r   r>   r?   r@   s   @r*   ra   ra   !  s    
WS#X%6 67 
 
 
gc3h&7!78 
 
 U   ,07#7 7 	7
 L)7 
c	7 7 C  r,   ra   )rN   ra   r   rB   N)r;   collectionsr   typingr   r   r   r   r	   processing_utilsr   utilsr   configuration_utilsr   onnxr   r   
get_loggerr7   rR   r   rB   rN   ra   __all__r   r,   r*   <module>r      s~    " # > > 2# 3   
		H	%]5' ]5@U5) U5pE4# E4P+z +\ Yr,   