
    fTh3                         S r SSKJrJr  \(       a   SSKJr  SSKJr  \R                  " \	5      r
 " S S\5      r " S S	\5      r " S
 S\5      r/ SQrg)zOWLv2 model configuration    )TYPE_CHECKINGDict   )PretrainedConfig)loggingc                   T   ^  \ rS rSrSrSrSr              SU 4S jjrSrU =r	$ )Owlv2TextConfig   a  
This is the configuration class to store the configuration of an [`Owlv2TextModel`]. It is used to instantiate an
Owlv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Owlv2
[google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the OWLv2 text model. Defines the number of different tokens that can be represented
        by the `inputs_ids` passed when calling [`Owlv2TextModel`].
    hidden_size (`int`, *optional*, defaults to 512):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    max_position_embeddings (`int`, *optional*, defaults to 16):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    pad_token_id (`int`, *optional*, defaults to 0):
        The id of the padding token in the input sequences.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the input sequences.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the input sequences.

Example:

```python
>>> from transformers import Owlv2TextConfig, Owlv2TextModel

>>> # Initializing a Owlv2TextModel with google/owlv2-base-patch16 style configuration
>>> configuration = Owlv2TextConfig()

>>> # Initializing a Owlv2TextConfig from the google/owlv2-base-patch16 style configuration
>>> model = Owlv2TextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```owlv2_text_modeltext_configc                    > [         TU ]  " SXUS.UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        g )N)pad_token_idbos_token_ideos_token_id )super__init__
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsmax_position_embeddings
hidden_actlayer_norm_epsattention_dropoutinitializer_rangeinitializer_factor)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                   e/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/owlv2/configuration_owlv2.pyr   Owlv2TextConfig.__init__^   s^    $ 	sl\hslrs$&!2!2#6 '>$$,!2!2"4    )r   r   r   r   r   r   r   r   r   r   r   )i      i            
quick_geluh㈵>        {Gz?      ?r   i  i  
__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyr   __static_attributes____classcell__r!   s   @r"   r	   r	      sK    9v $J#O  "5 5r$   r	   c                   P   ^  \ rS rSrSrSrSr            SU 4S jjrSrU =r	$ )Owlv2VisionConfig   a
  
This is the configuration class to store the configuration of an [`Owlv2VisionModel`]. It is used to instantiate
an OWLv2 image encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OWLv2
[google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 768):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import Owlv2VisionConfig, Owlv2VisionModel

>>> # Initializing a Owlv2VisionModel with google/owlv2-base-patch16 style configuration
>>> configuration = Owlv2VisionConfig()

>>> # Initializing a Owlv2VisionModel model from the google/owlv2-base-patch16 style configuration
>>> model = Owlv2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```owlv2_vision_modelvision_configc                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        Xl        g )Nr   )r   r   r   r   r   r   num_channels
image_size
patch_sizer   r   r   r   r   )r   r   r   r   r   r?   r@   rA   r   r   r   r   r   r    r!   s                 r"   r   Owlv2VisionConfig.__init__   sZ      	"6"&!2!2#6 ($$$,!2!2"4r$   )r   r   r   r@   r   r   r   r   r   r?   r   rA   )   i   r&   r&   r   rC   r(   r)   r*   r+   r,   r-   r.   r8   s   @r"   r:   r:      sE    2h &J%O 5 5r$   r:   c                   d   ^  \ rS rSrSrSr\\S.r     S
U 4S jjr	\
S\S\4S j5       rS	rU =r$ )Owlv2Config   a]  
[`Owlv2Config`] is the configuration class to store the configuration of an [`Owlv2Model`]. It is used to
instantiate an OWLv2 model according to the specified arguments, defining the text model and vision model
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the OWLv2
[google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Owlv2TextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Owlv2VisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter. Default is used as per the original OWLv2
        implementation.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return a dictionary. If `False`, returns a tuple.
    kwargs (*optional*):
        Dictionary of keyword arguments.
owlv2)r   r=   c                    > [         TU ]  " S0 UD6  Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        [	        S0 UD6U l        [        S0 UD6U l        X0l        X@l	        XPl
        SU l        g )NzJtext_config is None. Initializing the Owlv2TextConfig with default values.zNvision_config is None. initializing the Owlv2VisionConfig with default values.r-   r   )r   r   loggerinfor	   r   r:   r=   projection_dimlogit_scale_init_valuereturn_dictr   )r   r   r=   rK   rL   rM   r    r!   s          r"   r   Owlv2Config.__init__   s     	"6"KKKde MKKhi*9[9.??,&<#&"%r$   r   r=   c                 <    0 nXS'   X$S'   U R                   " U40 UD6$ )z
Instantiate a [`Owlv2Config`] (or a derived class) from owlv2 text model configuration and owlv2 vision
model configuration.

Returns:
    [`Owlv2Config`]: An instance of a configuration object
r   r=   )	from_dict)clsr   r=   r    config_dicts        r"   from_text_vision_configs$Owlv2Config.from_text_vision_configs  s-     %0M"'4O$}}[3F33r$   )r   rL   rK   rM   r   r=   )NNr%   g/L
F@T)r/   r0   r1   r2   r3   r4   r	   r:   sub_configsr   classmethodr   rS   r6   r7   r8   s   @r"   rE   rE      sR    2 J"1DUVK %&6 44 4 4 4r$   rE   )rE   r	   r:   N)r3   typingr   r   configuration_utilsr   utilsr   
get_loggerr/   rI   r	   r:   rE   __all__r   r$   r"   <module>r\      se      &  3  
		H	%]5& ]5BU5( U5rE4" E4P Br$   