
"""GroupViT model configuration"""

from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging


if TYPE_CHECKING:
    from ...processing_utils import ProcessorMixin
    from ...utils import TensorType


logger = logging.get_logger(__name__)


class GroupViTTextConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`GroupViTTextModel`]. It is used to instantiate a
GroupViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the GroupViT
[nvidia/groupvit-gcc-yfcc](https://huggingface.co/nvidia/groupvit-gcc-yfcc) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the GroupViT text model. Defines the number of different tokens that can be represented
        by the `input_ids` passed when calling [`GroupViTModel`].
    hidden_size (`int`, *optional*, defaults to 256):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 4):
        Number of attention heads for each attention layer in the Transformer encoder.
    max_position_embeddings (`int`, *optional*, defaults to 77):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-5):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    dropout (`float`, *optional*, defaults to 0.0):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import GroupViTTextConfig, GroupViTTextModel

>>> # Initializing a GroupViTTextModel with nvidia/groupvit-gcc-yfcc style configuration
>>> configuration = GroupViTTextConfig()

>>> model = GroupViTTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

    model_type = "groupvit_text_model"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=256,
        intermediate_size=1024,
        num_hidden_layers=12,
        num_attention_heads=4,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        dropout=0.0,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout


class GroupViTVisionConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`GroupViTVisionModel`]. It is used to instantiate
a GroupViT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the GroupViT
[nvidia/groupvit-gcc-yfcc](https://huggingface.co/nvidia/groupvit-gcc-yfcc) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 384):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 1536):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    depths (`List[int]`, *optional*, defaults to [6, 3, 3]):
        The number of layers in each encoder block.
    num_group_tokens (`List[int]`, *optional*, defaults to [64, 8, 0]):
        The number of group tokens for each stage.
    num_output_groups (`List[int]`, *optional*, defaults to [64, 8, 8]):
        The number of output groups for each stage; 0 means no group.
    num_attention_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer encoder.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-5):
        The epsilon used by the layer normalization layers.
    dropout (`float`, *optional*, defaults to 0.0):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import GroupViTVisionConfig, GroupViTVisionModel

>>> # Initializing a GroupViTVisionModel with nvidia/groupvit-gcc-yfcc style configuration
>>> configuration = GroupViTVisionConfig()

>>> model = GroupViTVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

    model_type = "groupvit_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=384,
        intermediate_size=1536,
        depths=[6, 3, 3],
        num_hidden_layers=12,
        num_group_tokens=[64, 8, 0],
        num_output_groups=[64, 8, 8],
        num_attention_heads=6,
        image_size=224,
        patch_size=16,
        num_channels=3,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        dropout=0.0,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        assign_eps=1.0,
        assign_mlp_ratio=[0.5, 4],
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.depths = depths
        if num_hidden_layers != sum(depths):
            logger.warning(
                f"Manually setting num_hidden_layers to {num_hidden_layers}, but we expect num_hidden_layers = "
                f"sum(depth) = {sum(depths)}"
            )
        self.num_hidden_layers = num_hidden_layers
        self.num_group_tokens = num_group_tokens
        self.num_output_groups = num_output_groups
        self.num_attention_heads = num_attention_heads
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.assign_eps = assign_eps
        self.assign_mlp_ratio = assign_mlp_ratio


class GroupViTConfig(PretrainedConfig):
    r"""
[`GroupViTConfig`] is the configuration class to store the configuration of a [`GroupViTModel`]. It is used to
instantiate a GroupViT model according to the specified arguments, defining the text model and vision model
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the GroupViT
[nvidia/groupvit-gcc-yfcc](https://huggingface.co/nvidia/groupvit-gcc-yfcc) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`GroupViTTextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`GroupViTVisionConfig`].
    projection_dim (`int`, *optional*, defaults to 256):
        Dimensionality of text and vision projection layers.
    projection_intermediate_dim (`int`, *optional*, defaults to 4096):
        Dimensionality of intermediate layer of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter. Default is used as per the original GroupViT
        implementation.
    kwargs (*optional*):
        Dictionary of keyword arguments.
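
Example (a minimal usage sketch; `GroupViTModel` is the model class that consumes this configuration):

```python
>>> from transformers import GroupViTConfig, GroupViTModel

>>> # Initializing a GroupViTConfig with nvidia/groupvit-gcc-yfcc style configuration
>>> configuration = GroupViTConfig()

>>> # Initializing a GroupViTModel (with random weights) from that configuration
>>> model = GroupViTModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # A GroupViTConfig can also be built from a GroupViTTextConfig and a GroupViTVisionConfig
>>> from transformers import GroupViTTextConfig, GroupViTVisionConfig

>>> config_text = GroupViTTextConfig()
>>> config_vision = GroupViTVisionConfig()

>>> config = GroupViTConfig.from_text_vision_configs(config_text, config_vision)
```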
    """

    model_type = "groupvit"
    sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig}

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=256,
        projection_intermediate_dim=4096,
        logit_scale_init_value=2.6592,
        **kwargs,
    ):
        # If the legacy `text_config_dict`/`vision_config_dict` kwargs are present, pop them before calling
        # `super().__init__` so they are not saved alongside the config.
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, the values in the
        # `_dict` variant are used to update the corresponding config, to stay backward compatible with configs
        # that were saved with the older `_config_dict` attributes.
        if text_config_dict is not None:
            if text_config is None:
                text_config = {}

            # This is the complete result when using `text_config_dict`.
            _text_config_dict = GroupViTTextConfig(**text_config_dict).to_dict()

            # Give a warning if a value exists in both `_text_config_dict` and `text_config` but differs.
            for key, value in _text_config_dict.items():
                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
                    # If specified in `text_config_dict`
                    if key in text_config_dict:
                        message = (
                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different "
                            f'values. The value `text_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be careful)
                    else:
                        message = (
                            f"`text_config_dict` is provided which will be used to initialize `GroupViTTextConfig`. "
                            f'The value `text_config["{key}"]` will be overridden.'
                        )
                    logger.info(message)

            # Update all values in `text_config` with the ones in `_text_config_dict`.
            text_config.update(_text_config_dict)

        if vision_config_dict is not None:
            if vision_config is None:
                vision_config = {}

            # This is the complete result when using `vision_config_dict`.
            _vision_config_dict = GroupViTVisionConfig(**vision_config_dict).to_dict()
            # Convert keys to string instead of integer.
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value for key, value in _vision_config_dict["id2label"].items()
                }

            # Give a warning if a value exists in both `_vision_config_dict` and `vision_config` but differs.
            for key, value in _vision_config_dict.items():
                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
                    # If specified in `vision_config_dict`
                    if key in vision_config_dict:
                        message = (
                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be careful)
                    else:
                        message = (
                            f"`vision_config_dict` is provided which will be used to initialize "
                            f'`GroupViTVisionConfig`. The value `vision_config["{key}"]` will be overridden.'
                        )
                    logger.info(message)

            # Update all values in `vision_config` with the ones in `_vision_config_dict`.
            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `GroupViTTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. Initializing the `GroupViTVisionConfig` with default values.")

        self.text_config = GroupViTTextConfig(**text_config)
        self.vision_config = GroupViTVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.projection_intermediate_dim = projection_intermediate_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_range = 0.02
        self.initializer_factor = 1.0
        self.output_segmentation = False

    @classmethod
    def from_text_vision_configs(cls, text_config: GroupViTTextConfig, vision_config: GroupViTVisionConfig, **kwargs):
        r"""
Instantiate a [`GroupViTConfig`] (or a derived class) from groupvit text model configuration and groupvit
vision model configuration.

Returns:
    [`GroupViTConfig`]: An instance of a configuration object
        """

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)


class GroupViTOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("logits_per_image", {0: "batch"}),
                ("logits_per_text", {0: "batch"}),
                ("text_embeds", {0: "batch"}),
                ("image_embeds", {0: "batch"}),
            ]
        )

    @property
    def atol_for_validation(self) -> float:
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        text_input_dict = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        image_input_dict = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        return {**text_input_dict, **image_input_dict}

    @property
    def default_onnx_opset(self) -> int:
        return 14
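
# Example (an illustrative sketch, not part of the module itself): producing dummy inputs for ONNX export of a
# GroupViT checkpoint. It assumes `AutoProcessor` resolves a processor for "nvidia/groupvit-gcc-yfcc" and that
# PyTorch is installed.
#
#     from transformers import AutoProcessor, GroupViTModel
#     from transformers.utils import TensorType
#
#     model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
#     processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
#     onnx_config = GroupViTOnnxConfig.from_model_config(model.config)
#     dummy_inputs = onnx_config.generate_dummy_inputs(processor, framework=TensorType.PYTORCH)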

__all__ = ["GroupViTConfig", "GroupViTOnnxConfig", "GroupViTTextConfig", "GroupViTVisionConfig"]