
from ...configuration_utils import PretrainedConfig
from ..auto import CONFIG_MAPPING, AutoConfig


class InternVLVisionConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of an [`InternVLVisionModel`]. It is used to instantiate an InternVLVisionModel
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
a similar configuration to that of InternVL3-1B.
e.g. [OpenGVLab/InternVL3-1B-hf](https://huggingface.co/OpenGVLab/InternVL3-1B-hf)

Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    use_qk_norm (`bool`, *optional*, defaults to `False`):
        Whether to apply normalization to the queries and keys before the attention operation.
    intermediate_size (`int`, *optional*, defaults to 4096):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` are supported.
    hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for attention weights.
    projection_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for the projection layer.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    norm_type (`str`, *optional*, defaults to `"layer_norm"`):
        The type of normalization to use in the encoder. Can be `"layer_norm"` or `"rms_norm"`.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    image_size (`int` or `list[int]`, *optional*, defaults to `[448, 448]`):
        The size (resolution) of each image.
    patch_size (`int` or `list[int]`, *optional*, defaults to `[14, 14]`):
        The size (resolution) of each patch.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    use_mask_token (`bool`, *optional*, defaults to `False`):
        Whether to use a mask token for masked image modeling.
    use_absolute_position_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to use BERT-style absolute position embeddings.
    layer_scale_init_value (`float`, *optional*, defaults to 0.1):
        Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
    use_mean_pooling (`bool`, *optional*, defaults to `True`):
        Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
        CLS token, before applying the classification head.

Example:

```python
>>> from transformers import InternVLVisionConfig, InternVLVisionModel

>>> # Initializing an InternVLVisionModel OpenGVLab/InternVL3-1B-hf style configuration
>>> configuration = InternVLVisionConfig()

>>> # Initializing a model (with random weights) from the OpenGVLab/InternVL3-1B-hf configuration
>>> model = InternVLVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
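>>> # Sketch of the documented int-or-list behavior: scalar `image_size`/`patch_size`
>>> # values are normalized to (height, width) pairs by `__init__`
>>> InternVLVisionConfig(image_size=448, patch_size=14).image_size
(448, 448)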
```"""

    model_type = "internvl_vision"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
        attention_bias=False,
        use_qk_norm=False,
        intermediate_size=4096,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_dropout=0.0,
        projection_dropout=0.0,
        initializer_range=0.02,
        norm_type="layer_norm",
        layer_norm_eps=1e-06,
        image_size=[448, 448],
        patch_size=[14, 14],
        num_channels=3,
        use_mask_token=False,
        use_absolute_position_embeddings=True,
        layer_scale_init_value=0.1,
        use_mean_pooling=True,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_bias = attention_bias
        self.use_qk_norm = use_qk_norm
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_dropout = attention_dropout
        self.projection_dropout = projection_dropout
        self.initializer_range = initializer_range
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps

        image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, (list, tuple)) else (patch_size, patch_size)
        self.image_size = image_size
        self.patch_size = patch_size

        self.num_channels = num_channels
        self.use_mask_token = use_mask_token
        self.use_absolute_position_embeddings = use_absolute_position_embeddings
        self.layer_scale_init_value = layer_scale_init_value
        self.use_mean_pooling = use_mean_pooling


class InternVLConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of an [`InternVLForConditionalGeneration`]. It is used to instantiate an
InternVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of InternVL3-1B.
e.g. [OpenGVLab/InternVL3-1B-hf](https://huggingface.co/OpenGVLab/InternVL3-1B-hf)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `InternVLVisionConfig`):
        The config object or dictionary of the vision backbone.
    text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
        The config object or dictionary of the text backbone.
    image_token_id (`int`, *optional*, defaults to 151667):
        The image token index to encode the image prompt.
    image_seq_length (`int`, *optional*, defaults to 256):
        Number of image tokens to use per image patch.
    downsample_ratio (`float`, *optional*, defaults to 0.5):
        Factor by which to downsample the image.
    projector_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the projector.
    vision_feature_layer (`int`, *optional*, defaults to -1):
        The index of the layer to use as the image features.
    vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
        The feature selection strategy used to select the vision feature from the vision backbone.
        Can be one of `"default"` or `"full"`.

```python
>>> from transformers import InternVLForConditionalGeneration, InternVLConfig

>>> # Initializing an InternVL style configuration
>>> configuration = InternVLConfig()

>>> # Initializing a model (with random weights) from the OpenGVLab/InternVL3-1B-hf configuration
>>> model = InternVLForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
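>>> # Sketch (default values assumed): `image_seq_length` is consistent with the
>>> # patch grid after downsampling, i.e. (448 / 14 * 0.5) ** 2 = 256
>>> grid = configuration.vision_config.image_size[0] // configuration.vision_config.patch_size[0]
>>> int((grid * configuration.downsample_ratio) ** 2) == configuration.image_seq_length
True
>>> # The text backbone falls back to Qwen2 when none is given
>>> configuration.text_config.model_type
'qwen2'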
```"""

    model_type = "internvl"
    sub_configs = {"text_config": AutoConfig, "vision_config": InternVLVisionConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        image_token_id=151667,
        image_seq_length=256,
        downsample_ratio=0.5,
        projector_hidden_act="gelu",
        vision_feature_layer=-1,
        vision_feature_select_strategy="default",
        **kwargs,
    ):
        self.image_token_id = image_token_id
        self.image_seq_length = image_seq_length
        self.downsample_ratio = downsample_ratio
        self.projector_hidden_act = projector_hidden_act
        self.vision_feature_layer = vision_feature_layer
        self.vision_feature_select_strategy = vision_feature_select_strategy

        if isinstance(vision_config, dict):
            self.vision_config = InternVLVisionConfig(**vision_config)
        elif isinstance(vision_config, InternVLVisionConfig):
            self.vision_config = vision_config
        elif vision_config is None:
            self.vision_config = InternVLVisionConfig()

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen2"
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["qwen2"]()

        self.text_config = text_config

        super().__init__(**kwargs)


__all__ = ["InternVLVisionConfig", "InternVLConfig"]
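
# Usage sketch (illustrative, not part of the module API): building the nested
# config from plain dicts, the way `from_pretrained` does after parsing a
# checkpoint's config.json. Because this module uses relative imports, run it
# as `python -m transformers.models.internvl.configuration_internvl`.
if __name__ == "__main__":
    config = InternVLConfig(
        vision_config={"hidden_size": 1024, "num_hidden_layers": 24},  # InternVLVisionConfig kwargs
        text_config={"model_type": "qwen2"},  # resolved through CONFIG_MAPPING
    )
    assert isinstance(config.vision_config, InternVLVisionConfig)
    assert config.text_config.model_type == "qwen2"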