"""SAM model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class SamPromptEncoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes. Instantiating a configuration with the defaults
    will yield a similar configuration to that of the SAM ViT-H
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
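
    Example (a minimal usage sketch; the values shown are simply the documented defaults):

    ```python
    >>> from transformers import SamPromptEncoderConfig

    >>> # Initializing a SamPromptEncoderConfig with `"facebook/sam-vit-huge"` style defaults
    >>> configuration = SamPromptEncoderConfig()

    >>> # The prompt encoder works on the image embedding grid, whose side is image_size // patch_size
    >>> configuration.image_embedding_size
    64
    ```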
    """

    base_config_key = "prompt_encoder_config"

    def __init__(
        self,
        hidden_size=256,
        image_size=1024,
        patch_size=16,
        mask_input_channels=16,
        num_point_embeddings=4,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.image_embedding_size = image_size // patch_size
        self.mask_input_channels = mask_input_channels
        self.num_point_embeddings = num_point_embeddings
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps


class SamMaskDecoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
    mask decoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the SAM ViT-H
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            The non-linear activation function used inside the `SamMaskDecoder` module.
        mlp_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsampling rate of the attention layer.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The number of layers in the IoU head module.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The dimensionality of the hidden states in the IoU head module.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.

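    Example (a minimal usage sketch; the values shown are simply the documented defaults):

    ```python
    >>> from transformers import SamMaskDecoderConfig

    >>> # Initializing a SamMaskDecoderConfig with `"facebook/sam-vit-huge"` style defaults
    >>> configuration = SamMaskDecoderConfig()

    >>> # As in the Segment Anything paper, three mask outputs are produced by default
    >>> configuration.num_multimask_outputs
    3
    ```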
    """

    base_config_key = "mask_decoder_config"

    def __init__(
        self,
        hidden_size=256,
        hidden_act="relu",
        mlp_dim=2048,
        num_hidden_layers=2,
        num_attention_heads=8,
        attention_downsample_rate=2,
        num_multimask_outputs=3,
        iou_head_depth=3,
        iou_head_hidden_dim=256,
        layer_norm_eps=1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_dim = mlp_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_downsample_rate = attention_downsample_rate
        self.num_multimask_outputs = num_multimask_outputs
        self.iou_head_depth = iou_head_depth
        self.iou_head_hidden_dim = iou_head_hidden_dim
        self.layer_norm_eps = layer_norm_eps


class SamVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the SAM ViT-H
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        output_channels (`int`, *optional*, defaults to 256):
            Dimensionality of the output channels in the Patch Encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input image.
        image_size (`int`, *optional*, defaults to 1024):
            Expected resolution. Target size of the resized input image.
        patch_size (`int`, *optional*, defaults to 16):
            Size of the patches to be extracted from the input image.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string).
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to query, key, value projections.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of mlp hidden dim to embedding dim.
        use_abs_pos (`bool`, *optional*, defaults to `True`):
            Whether to use absolute position embedding.
        use_rel_pos (`bool`, *optional*, defaults to `True`):
            Whether to use relative position embedding.
        window_size (`int`, *optional*, defaults to 14):
            Window size for relative position.
        global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
            The indexes of the global attention layers.
        num_pos_feats (`int`, *optional*, defaults to 128):
            The dimensionality of the position embedding.
        mlp_dim (`int`, *optional*):
            The dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio *
            hidden_size`.

    Example:

    ```python
    >>> from transformers import (
    ...     SamVisionConfig,
    ...     SamVisionModel,
    ... )

    >>> # Initializing a SamVisionConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamVisionConfig()

    >>> # Initializing a SamVisionModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    base_config_key = "vision_config"
    model_type = "sam_vision_model"

    def __init__(
        self,
        hidden_size=768,
        output_channels=256,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=1024,
        patch_size=16,
        hidden_act="gelu",
        layer_norm_eps=1e-06,
        attention_dropout=0.0,
        initializer_range=1e-10,
        qkv_bias=True,
        mlp_ratio=4.0,
        use_abs_pos=True,
        use_rel_pos=True,
        window_size=14,
        global_attn_indexes=[2, 5, 8, 11],
        num_pos_feats=128,
        mlp_dim=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.output_channels = output_channels
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.qkv_bias = qkv_bias
        self.mlp_ratio = mlp_ratio
        self.use_abs_pos = use_abs_pos
        self.use_rel_pos = use_rel_pos
        self.window_size = window_size
        self.global_attn_indexes = global_attn_indexes
        self.num_pos_feats = num_pos_feats
        self.mlp_dim = int(mlp_ratio * hidden_size) if mlp_dim is None else mlp_dim


class SamConfig(PretrainedConfig):
    r"""
    [`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
    SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].

        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     SamVisionConfig,
    ...     SamPromptEncoderConfig,
    ...     SamMaskDecoderConfig,
    ...     SamModel,
    ... )

    >>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamConfig()

    >>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

    >>> # Initializing SAM vision, prompt encoder, and mask decoder configurations
    >>> vision_config = SamVisionConfig()
    >>> prompt_encoder_config = SamPromptEncoderConfig()
    >>> mask_decoder_config = SamMaskDecoderConfig()

    >>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```Zsam)r   r$   r1   N{Gz?c                    s   t  jdi | |d ur|ni }|d ur|ni }|d ur|ni }t|tr*| }t|tr3| }t|tr<| }tdi || _tdi || _tdi || _	|| _
d S r   )r   r   
isinstancer0   to_dictr   r#   r1   r   r$   r:   )r   r1   r   r$   r:   r   r   r   r   r   7  s   



zSamConfig.__init__)NNNrE   )r   r   r   r    rC   r   r#   r0   Zsub_configsr   r"   r   r   r   r   rD      s    1rD   )rD   r#   r   r0   N)r    Zconfiguration_utilsr   utilsr   Z
get_loggerr   loggerr   r#   r0   rD   __all__r   r   r   r   <module>   s   
1?vS