o
    Zh;                     @   s`   d Z ddlmZ ddlmZ eeZG dd deZG dd deZ	G dd	 d	eZ
d	gZd
S )zIdefics model configuration   )PretrainedConfig)loggingc                       sF   e Zd ZdZdZddiZ							
						d fdd	Z  ZS )IdeficsVisionConfiga	  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        embed_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
            testing).
    Zidefics_visionhidden_size	embed_dim                   r   geluh㈵>        {Gz?      ?c                    s^   || _ || _|| _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _t jdi | d S N )r   
image_sizeintermediate_size
patch_sizenum_hidden_layersnum_attention_headsnum_channelslayer_norm_epsattention_dropoutinitializer_rangeinitializer_factor
hidden_actsuper__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__r   `/var/www/auris/lib/python3.10/site-packages/transformers/models/idefics/configuration_idefics.pyr    J   s   zIdeficsVisionConfig.__init__)r   r   r	   r
   r   r   r   r   r   r   r   r   )__name__
__module____qualname____doc__
model_typeZattribute_mapr    __classcell__r   r   r#   r%   r      s$    'r   c                       s2   e Zd ZdZdZ						d
 fdd		Z  ZS )IdeficsPerceiverConfiga  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        use_resampler (`bool`, *optional*, defaults to `False`):
            Whether or not to use the resampler
        resampler_n_latents (`int`, *optional*, defaults to 64):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 6):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
            Whether or not to use qk layer norms in perceiver
    Zidefics_percieverF@      r   `   c                    s:   || _ || _|| _|| _|| _|| _t jdi | d S r   )use_resamplerresampler_n_latentsresampler_depthresampler_n_headsresampler_head_dimqk_layer_norms_perceiverr   r    )r!   r0   r1   r2   r3   r4   r5   r"   r#   r   r%   r       s   
zIdeficsPerceiverConfig.__init__)Fr-   r.   r   r/   F)r&   r'   r(   r)   r*   r    r+   r   r   r#   r%   r,   j   s    r,   c                       sh   e Zd ZdZdZeedZddddddd	d
ddd	ddddddddddg ddg dddf fdd	Z  Z	S )IdeficsConfiga  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        additional_vocab_size (`int`, *optional*, defaults to 0):
            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
            are always trainable whereas regular vocab tokens can be frozen or not.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~IdeficsModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
            Initialization type for the alphas.
        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
            Attention.
        alpha_type (`str`, *optional*, defaults to `"float"`):
            Whether the gating alphas should be vectors or single floats.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0)
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1)
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2)
            End of stream token id.
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        cross_layer_interval (`int`, *optional*, default to 1)
            Interval for cross attention (from text to image) layers.
        qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k
        freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers
        freeze_text_module_exceptions (`bool`, *optional*, defaults to `[]`):
            Exceptions to freezing text layers when `freeze_text_layers` is `True`
        freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head
        freeze_vision_layers (`bool`, *optional*, defaults to `True`):  Whether to freeze vision layers
        freeze_vision_module_exceptions (`bool`, *optional*, defaults to `[]`):
            Exceptions to freezing vision layers when `freeze_vision_layers` is `True`
        use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler
        vision_config (`IdeficsVisionConfig`,  *optional*): Custom vision config or dict
        perceiver_config (`IdeficsPerceiverConfig`,  *optional*): Custom perceiver config or dict

    Example:

    ```python
    >>> from transformers import IdeficsModel, IdeficsConfig

    >>> # Initializing a Idefics idefics-9b style configuration
    >>> configuration = IdeficsConfig()

    >>> # Initializing a model from the idefics-9b style configuration
    >>> model = IdeficsModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Zidefics)perceiver_configvision_configi }      i   i +  r   r   Zsilur   Zzerosfloatgư>T      FNc                    s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|d u rKt | _nt|trYtdi || _nt|tra|| _|d u rjt | _nt|trxtdi || _nt|tr|| _t jd||||d| d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingsr   )
vocab_sizeadditional_vocab_sizer   r   r   r   dropoutr   r   alpha_initializeralphas_initializer_range
alpha_typerms_norm_eps	use_cachecross_layer_intervalqk_layer_normsfreeze_vision_layersfreeze_text_layersfreeze_text_module_exceptionsfreeze_vision_module_exceptionsfreeze_lm_headr0   r,   r7   
isinstancedictr   r8   r   r    )r!   rA   rB   r   r   r   r   rC   r   r   rD   rE   rF   rG   rH   r=   r>   r?   r@   rI   rJ   rL   rM   rO   rK   rN   r0   r8   r7   r"   r#   r   r%   r       sT    






zIdeficsConfig.__init__)
r&   r'   r(   r)   r*   r,   r   Zsub_configsr    r+   r   r   r#   r%   r6      sB    P
r6   N)r)   Zconfiguration_utilsr   utilsr   Z
get_loggerr&   loggerr   r,   r6   __all__r   r   r   r%   <module>   s   
M0 
,