
    fTh|9                         S r SSKJr  SSKJr  \R
                  " \5      r " S S\5      r " S S\5      r	 " S S	\5      r
 " S
 S\5      r/ SQrg)zSAM model configuration   )PretrainedConfig)loggingc                   B   ^  \ rS rSrSrSr       SU 4S jjrSrU =r$ )SamPromptEncoderConfig   a  
This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
module is used to encode the input 2D points and bounding boxes. Instantiating a configuration defaults will yield
a similar configuration to that of the SAM-vit-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 256):
        Dimensionality of the hidden states.
    image_size (`int`, *optional*, defaults to 1024):
        The expected output resolution of the image.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    mask_input_channels (`int`, *optional*, defaults to 16):
        The number of channels to be fed to the `MaskDecoder` module.
    num_point_embeddings (`int`, *optional*, defaults to 4):
        The number of point embeddings to be used.
    hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function in the encoder and pooler.
prompt_encoder_configc                    > [         T	U ]  " S0 UD6  Xl        X l        X0l        X#-  U l        X@l        XPl        X`l        Xpl	        g N )
super__init__hidden_size
image_size
patch_sizeimage_embedding_sizemask_input_channelsnum_point_embeddings
hidden_actlayer_norm_eps)
selfr   r   r   r   r   r   r   kwargs	__class__s
            a/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/sam/configuration_sam.pyr   SamPromptEncoderConfig.__init__3   sG     	"6"&$$$.$<!#6 $8!$,    )r   r   r   r   r   r   r   r   )         r      geluư>	__name__
__module____qualname____firstlineno____doc__base_config_keyr   __static_attributes____classcell__r   s   @r   r   r      s0    0 .O - -r   r   c                   H   ^  \ rS rSrSrSr          SU 4S jjrSrU =r$ )SamMaskDecoderConfigI   aO  
This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults
will yield a similar configuration to that of the SAM-vit-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 256):
        Dimensionality of the hidden states.
    hidden_act (`str`, *optional*, defaults to `"relu"`):
        The non-linear activation function used inside the `SamMaskDecoder` module.
    mlp_dim (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 2):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    attention_downsample_rate (`int`, *optional*, defaults to 2):
        The downsampling rate of the attention layer.
    num_multimask_outputs (`int`, *optional*, defaults to 3):
        The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
    iou_head_depth (`int`, *optional*, defaults to 3):
        The number of layers in the IoU head module.
    iou_head_hidden_dim (`int`, *optional*, defaults to 256):
        The dimensionality of the hidden states in the IoU head module.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.

mask_decoder_configc                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        g r
   )r   r   r   r   mlp_dimnum_hidden_layersnum_attention_headsattention_downsample_ratenum_multimask_outputsiou_head_depthiou_head_hidden_dimr   )r   r   r   r1   r2   r3   r4   r5   r6   r7   r   r   r   s               r   r   SamMaskDecoderConfig.__init__m   sO     	"6"&$!2#6 )B&%:",#6 ,r   )
r4   r   r   r6   r7   r   r1   r3   r2   r5   )
r   relui         r:   r   r   r   r!   r"   r+   s   @r   r-   r-   I   s:    B ,O "#- -r   r-   c                   b   ^  \ rS rSrSrSrSrSSSSSS	S
SSSSSSSSS/ SQSS4U 4S jjrSrU =r	$ )SamVisionConfig   a&  
This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
defaults will yield a similar configuration to that of the SAM ViT-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    output_channels (`int`, *optional*, defaults to 256):
        Dimensionality of the output channels in the Patch Encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input image.
    image_size (`int`, *optional*, defaults to 1024):
        Expected resolution. Target size of the resized input image.
    patch_size (`int`, *optional*, defaults to 16):
        Size of the patches to be extracted from the input image.
    hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string)
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 1e-10):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    qkv_bias (`bool`, *optional*, defaults to `True`):
        Whether to add a bias to query, key, value projections.
    mlp_ratio (`float`, *optional*, defaults to 4.0):
        Ratio of mlp hidden dim to embedding dim.
    use_abs_pos (`bool`, *optional*, defaults to `True`):
        Whether to use absolute position embedding.
    use_rel_pos (`bool`, *optional*, defaults to `True`):
        Whether to use relative position embedding.
    window_size (`int`, *optional*, defaults to 14):
        Window size for relative position.
    global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
        The indexes of the global attention layers.
    num_pos_feats (`int`, *optional*, defaults to 128):
        The dimensionality of the position embedding.
    mlp_dim (`int`, *optional*):
        The dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio *
        hidden_size`.

Example:

```python
>>> from transformers import (
...     SamVisionConfig,
...     SamVisionModel,
... )

>>> # Initializing a SamVisionConfig with `"facebook/sam-vit-huge"` style configuration
>>> configuration = SamVisionConfig()

>>> # Initializing a SamVisionModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
>>> model = SamVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```vision_configsam_vision_modeli   r      r   r   r   r    r!   g        g|=Tg      @   )r:      r;         Nc                 >  > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        Xl        Xl        Xl        Xl        UU l        UU l        UU l        Uc  [)        X-  5      U l        g UU l        g r
   )r   r   r   output_channelsr2   r3   num_channelsr   r   r   r   attention_dropoutinitializer_rangeqkv_bias	mlp_ratiouse_abs_posuse_rel_poswindow_sizeglobal_attn_indexesnum_pos_featsintr1   )r   r   rG   r2   r3   rH   r   r   r   r   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   r1   r   r   s                        r   r   SamVisionConfig.__init__   s    . 	"6"&.!2#6 ($$$,!2!2 "&&&#6 *7>s;23Gr   )rI   rP   r   r   r   rJ   r   r1   rL   r3   rH   r2   rQ   rG   r   rK   rM   rN   rO   )
r#   r$   r%   r&   r'   r(   
model_typer   r)   r*   r+   s   @r   r=   r=      s]    BH &O#J ))+T +Tr   r=   c                   H   ^  \ rS rSrSrSr\\\S.r	    SU 4S jjr
SrU =r$ )	SamConfig   a"  
[`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
        Dictionary of configuration options used to initialize [`SamVisionConfig`].
    prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
        Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
    mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
        Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].

    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import (
...     SamVisionConfig,
...     SamPromptEncoderConfig,
...     SamMaskDecoderConfig,
...     SamModel,
... )

>>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
>>> configuration = SamConfig()

>>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
>>> model = SamModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

>>> # Initializing SAM vision, SAM Q-Former and language model configurations
>>> vision_config = SamVisionConfig()
>>> prompt_encoder_config = SamPromptEncoderConfig()
>>> mask_decoder_config = SamMaskDecoderConfig()

>>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
```sam)r   r/   r?   c                   > [         TU ]  " S0 UD6  Ub  UO0 nUb  UO0 nUb  UO0 n[        U[        5      (       a  UR	                  5       n[        U[
        5      (       a  UR	                  5       n[        U[        5      (       a  UR	                  5       n[        S0 UD6U l        [        S0 UD6U l        [        S0 UD6U l	        X@l
        g r
   )r   r   
isinstancer=   to_dictr   r-   r?   r   r/   rJ   )r   r?   r   r/   rJ   r   r   s         r   r   SamConfig.__init__7  s     	"6")6)B9N9Z 5`b5H5T1Z\m_55)113M+-CDD$9$A$A$C!)+?@@"5"="="?,=}=%;%T>S%T"#7#N:M#N !2r   )rJ   r/   r   r?   )NNNg{Gz?)r#   r$   r%   r&   r'   rT   r   r-   r=   sub_configsr   r)   r*   r+   s   @r   rV   rV      s8    /b J!73(K " 3 3r   rV   )rV   r-   r   r=   N)r'   configuration_utilsr   utilsr   
get_loggerr#   loggerr   r-   r=   rV   __all__r   r   r   <module>rc      sh     3  
		H	%.-- .-b<-+ <-~sT& sTlP3  P3f ]r   