
"""Blip model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class BlipTextConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`BlipTextModel`]. It is used to instantiate a BLIP
text model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the `BlipText` used by the [base
architectures](https://huggingface.co/Salesforce/blip-vqa-base).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 30524):
        Vocabulary size of the `Blip` text model. Defines the number of different tokens that can be represented by
        the `input_ids` passed when calling [`BlipModel`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    encoder_hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers from the vision model.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    max_position_embeddings (`int`, *optional*, defaults to 512):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-12):
        The epsilon used by the layer normalization layers.
    hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    bos_token_id (`int`, *optional*, defaults to 30522):
        The id of the `beginning-of-sequence` token.
    eos_token_id (`int`, *optional*, defaults to 2):
        The id of the `end-of-sequence` token.
    pad_token_id (`int`, *optional*, defaults to 0):
        The id of the `padding` token.
    sep_token_id (`int`, *optional*, defaults to 102):
        The id of the `separator` token.
    is_decoder (`bool`, *optional*, defaults to `True`):
        Whether the model is used as a decoder.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
    label_smoothing (`float`, *optional*, defaults to 0.0):
        A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no
        smoothing. The targets become a mixture of the original ground truth and a uniform distribution as
        described in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).

Example:

```python
>>> from transformers import BlipTextConfig, BlipTextModel

>>> # Initializing a BlipTextConfig with Salesforce/blip-vqa-base style configuration
>>> configuration = BlipTextConfig()

>>> # Initializing a BlipTextModel (with random weights) from the Salesforce/blip-vqa-base style configuration
>>> model = BlipTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
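
>>> # As a sketch of round-tripping the values above through a plain dict, using the
>>> # generic `to_dict`/`from_dict` helpers inherited from `PretrainedConfig`
>>> restored = BlipTextConfig.from_dict(configuration.to_dict())
>>> restored.vocab_size
30524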
```"""

    model_type = "blip_text_model"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=30524,
        hidden_size=768,
        encoder_hidden_size=768,
        intermediate_size=3072,
        projection_dim=768,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=512,
        hidden_act="gelu",
        layer_norm_eps=1e-12,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        bos_token_id=30522,
        eos_token_id=2,
        pad_token_id=0,
        sep_token_id=102,
        is_decoder=True,
        use_cache=True,
        label_smoothing=0.0,
        **kwargs,
    ):
        # Special token ids are forwarded to the base class so downstream utilities can find them.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.encoder_hidden_size = encoder_hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.hidden_dropout_prob = hidden_dropout_prob
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.is_decoder = is_decoder
        self.use_cache = use_cache
        self.label_smoothing = label_smoothing


class BlipVisionConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`BlipVisionModel`]. It is used to instantiate a
BLIP vision model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the BLIP-base
[Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    image_size (`int`, *optional*, defaults to 384):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-5):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 1e-10):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

Example:

```python
>>> from transformers import BlipVisionConfig, BlipVisionModel

>>> # Initializing a BlipVisionConfig with Salesforce/blip-vqa-base style configuration
>>> configuration = BlipVisionConfig()

>>> # Initializing a BlipVisionModel (with random weights) from the Salesforce/blip-vqa-base style configuration
>>> model = BlipVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
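
>>> # Illustration only: with the default 384x384 images and 16x16 patches assumed
>>> # above, the ViT-style encoder operates on a 24x24 grid of patch tokens
>>> configuration.image_size // configuration.patch_size
24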
```"""

    model_type = "blip_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        image_size=384,
        patch_size=16,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=1e-10,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class BlipConfig(PretrainedConfig):
    r"""
[`BlipConfig`] is the configuration class to store the configuration of a [`BlipModel`]. It is used to instantiate
a BLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
a configuration with the defaults will yield a similar configuration to that of the BLIP-base
[Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`BlipTextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`BlipVisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation.
    image_text_hidden_size (`int`, *optional*, defaults to 256):
        Dimensionality of the hidden state of the image-text fusion layer.
    label_smoothing (`float`, *optional*, defaults to 0.0):
        A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no
        smoothing. The targets become a mixture of the original ground truth and a uniform distribution as
        described in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import BlipConfig, BlipModel

>>> # Initializing a BlipConfig with Salesforce/blip-vqa-base style configuration
>>> configuration = BlipConfig()

>>> # Initializing a BlipModel (with random weights) from the Salesforce/blip-vqa-base style configuration
>>> model = BlipModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a BlipConfig from a BlipTextConfig and a BlipVisionConfig

>>> # Initializing a BLIPText and BLIPVision configuration
>>> config_text = BlipTextConfig()
>>> config_vision = BlipVisionConfig()

>>> config = BlipConfig.from_text_vision_configs(config_text, config_vision)
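
>>> # The composite config keeps the sub-configs consistent: the text model's
>>> # cross-attention width is tied to the vision hidden size at construction time
>>> config.text_config.encoder_hidden_size == config.vision_config.hidden_size
True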
```blipr	   rB   c                 d  > [         TU ]  " S0 UD6  Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        [	        S0 UD6U l        [        S0 UD6U l        U R                  R                  U R
                  l	        X0l
        X@l        SU l        SU l        XPl        X`l        g )NzO`text_config` is `None`. Initializing the `BlipTextConfig` with default values.zS`vision_config` is `None`. Initializing the `BlipVisionConfig` with default values.g      ?r/   r   )r   r   loggerinfor   r	   r?   rB   r   r   r   logit_scale_init_valueinitializer_factorr   image_text_hidden_sizer!   )	r"   r	   rB   r   rQ   rS   r!   r#   r$   s	           r%   r   BlipConfig.__init__  s     	"6"KKKij MKKmn)8K8->>/3/A/A/M/M,,&<#"%!%&<#.r'   r	   rB   c                 P    U " SUR                  5       UR                  5       S.UD6$ )z
Instantiate a [`BlipConfig`] (or a derived class) from blip text model configuration and blip vision model
configuration.

Returns:
    [`BlipConfig`]: An instance of a configuration object
"""

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)


__all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"]