
"""MobileViT model configuration"""

from collections import OrderedDict
from typing import Mapping

from packaging import version

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class MobileViTConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`MobileViTModel`]. It is used to instantiate a
MobileViT model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MobileViT
[apple/mobilevit-small](https://huggingface.co/apple/mobilevit-small) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    image_size (`int`, *optional*, defaults to 256):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 2):
        The size (resolution) of each patch.
    hidden_sizes (`List[int]`, *optional*, defaults to `[144, 192, 240]`):
        Dimensionality (hidden size) of the Transformer encoders at each stage.
    neck_hidden_sizes (`List[int]`, *optional*, defaults to `[16, 32, 64, 96, 128, 160, 640]`):
        The number of channels for the feature maps of the backbone.
    num_attention_heads (`int`, *optional*, defaults to 4):
        Number of attention heads for each attention layer in the Transformer encoder.
    mlp_ratio (`float`, *optional*, defaults to 2.0):
        The ratio of the number of channels in the output of the MLP to the number of channels in the input.
    expand_ratio (`float`, *optional*, defaults to 4.0):
        Expansion factor for the MobileNetv2 layers.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the Transformer encoder and convolution layers.
    conv_kernel_size (`int`, *optional*, defaults to 3):
        The size of the convolutional kernel in the MobileViT layer.
    output_stride (`int`, *optional*, defaults to 32):
        The ratio of the spatial resolution of the output to the resolution of the input image.
    hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the Transformer encoder.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout ratio for attached classifiers.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    qkv_bias (`bool`, *optional*, defaults to `True`):
        Whether to add a bias to the queries, keys and values.
    aspp_out_channels (`int`, *optional*, defaults to 256):
        Number of output channels used in the ASPP layer for semantic segmentation.
    atrous_rates (`List[int]`, *optional*, defaults to `[6, 12, 18]`):
        Dilation (atrous) factors used in the ASPP layer for semantic segmentation.
    aspp_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the ASPP layer for semantic segmentation.
    semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
        The index that is ignored by the loss function of the semantic segmentation model.

Example:

```python
>>> from transformers import MobileViTConfig, MobileViTModel

>>> # Initializing a mobilevit-small style configuration
>>> configuration = MobileViTConfig()

>>> # Initializing a model from the mobilevit-small style configuration
>>> model = MobileViTModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
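>>> # The documented defaults are exposed as plain attributes on the config
>>> configuration.neck_hidden_sizes
[16, 32, 64, 96, 128, 160, 640]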
```"""

    model_type = "mobilevit"

    def __init__(
        self,
        num_channels=3,
        image_size=256,
        patch_size=2,
        hidden_sizes=[144, 192, 240],
        neck_hidden_sizes=[16, 32, 64, 96, 128, 160, 640],
        num_attention_heads=4,
        mlp_ratio=2.0,
        expand_ratio=4.0,
        hidden_act="silu",
        conv_kernel_size=3,
        output_stride=32,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.0,
        classifier_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        qkv_bias=True,
        aspp_out_channels=256,
        atrous_rates=[6, 12, 18],
        aspp_dropout_prob=0.1,
        semantic_loss_ignore_index=255,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_sizes = hidden_sizes
        self.neck_hidden_sizes = neck_hidden_sizes
        self.num_attention_heads = num_attention_heads
        self.mlp_ratio = mlp_ratio
        self.expand_ratio = expand_ratio
        self.hidden_act = hidden_act
        self.conv_kernel_size = conv_kernel_size
        self.output_stride = output_stride
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.classifier_dropout_prob = classifier_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.qkv_bias = qkv_bias

        # decode head attributes for semantic segmentation
        self.aspp_out_channels = aspp_out_channels
        self.atrous_rates = atrous_rates
        self.aspp_dropout_prob = aspp_dropout_prob
        self.semantic_loss_ignore_index = semantic_loss_ignore_index
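
# Customization sketch (illustrative addition, not part of the original module):
# any documented default above can be overridden through `__init__` kwargs, since
# each argument is stored verbatim on the instance, e.g.:
#
#     config = MobileViTConfig(image_size=512, classifier_dropout_prob=0.0)
#     assert config.image_size == 512
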
class MobileViTOnnxConfig(OnnxConfig):
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict([("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"})])

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        if self.task == "image-classification":
            return OrderedDict([("logits", {0: "batch"})])
        else:
            return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})])

    @property
    def atol_for_validation(self) -> float:
        return 1e-4
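
# Usage sketch (illustrative addition, not part of the original module): the
# dynamic-axis mappings above tell the ONNX exporter which tensor dimensions
# may vary at runtime. This assumes the `OnnxConfig` base constructor signature
# `OnnxConfig(config, task="default")`:
#
#     onnx_config = MobileViTOnnxConfig(MobileViTConfig())
#     print(onnx_config.inputs)
#     # OrderedDict([('pixel_values', {0: 'batch', 1: 'num_channels', 2: 'height', 3: 'width'})])
#     print(MobileViTOnnxConfig(MobileViTConfig(), task="image-classification").outputs)
#     # OrderedDict([('logits', {0: 'batch'})])
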

__all__ = ["MobileViTConfig", "MobileViTOnnxConfig"]