o
    Zh+                     @   s  d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZmZ dd	lmZ d
dlmZ eG dd deZeG dd de	ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZeddG dd deZddgZdS )zPyTorch ViTMatte model.    )	dataclass)OptionalTupleN)nn   )PreTrainedModel)ModelOutputauto_docstring)load_backbone   )VitMatteConfigc                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )ImageMattingOutputa  
    Class for outputs of image matting models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Loss.
        alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
           Estimated alpha values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossalphashidden_states
attentions)__name__
__module____qualname____doc__r   r   torchZFloatTensor__annotations__r   r   r   r    r   r   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr      s   
 r   c                   @   s$   e Zd ZeZdZdZg Zdd ZdS )VitMattePreTrainedModelpixel_valuesTc                 C   sD   t |tjr|jjjd| jjd |jd ur |jj	  d S d S d S )Ng        )meanZstd)

isinstancer   Conv2dweightdataZnormal_configZinitializer_rangebiasZzero_)selfmoduler   r   r   _init_weights@   s   
z%VitMattePreTrainedModel._init_weightsN)	r   r   r   r   Zconfig_classZmain_input_nameZsupports_gradient_checkpointingZ_no_split_modulesr%   r   r   r   r   r   9   s    r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	VitMatteBasicConv3x3zP
    Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
       r   c                    sB   t    tj||d||dd| _tj||jd| _t | _	d S )Nr   F)in_channelsout_channelskernel_sizestridepaddingr"   )Zeps)
super__init__r   r   convBatchNorm2dZbatch_norm_eps
batch_normReLUrelu)r#   r!   r(   r)   r+   r,   	__class__r   r   r.   L   s   
zVitMatteBasicConv3x3.__init__c                 C   s"   |  |}| |}| |}|S N)r/   r1   r3   r#   Zhidden_stater   r   r   forwardY   s   


zVitMatteBasicConv3x3.forward)r'   r   r   r   r   r   r.   r8   __classcell__r   r   r4   r   r&   G   s    r&   c                       (   e Zd ZdZ fddZdd Z  ZS )VitMatteConvStreamzc
    Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
    c                    s   t    d}|jd ur|jj}|j}t | _|g| | _t	t
| jd D ]}| j| }| j|d  }| jt||| q'd S )N   r   )r-   r.   Zbackbone_configZnum_channelsconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendr&   )r#   r!   r(   r)   iZin_chan_Z	out_chan_r4   r   r   r.   f   s   



zVitMatteConvStream.__init__c                 C   sJ   d|i}|}t t| jD ]}| j| |}dt|d  }|||< q|S )NZdetailed_feature_map_0detailed_feature_map_r   )rB   rC   r@   str)r#   r   Zout_dictZ
embeddingsrE   Zname_r   r   r   r8   y   s   
zVitMatteConvStream.forwardr9   r   r   r4   r   r<   a   s    r<   c                       r;   )VitMatteFusionBlockz\
    Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
    c                    s"   t    t|||ddd| _d S )Nr   )r+   r,   )r-   r.   r&   r/   )r#   r!   r(   r)   r4   r   r   r.      s   
zVitMatteFusionBlock.__init__c                 C   s4   t jj|dddd}tj||gdd}| |}|S )Nr'   ZbilinearF)Zscale_factormodeZalign_cornersr   )dim)r   Z
functionalZinterpolater   catr/   )r#   featuresZdetailed_feature_mapZupscaled_featuresoutr   r   r   r8      s   
zVitMatteFusionBlock.forwardr9   r   r   r4   r   rH      s    rH   c                       r;   )VitMatteHeadzJ
    Simple Matting Head, containing only conv3x3 and conv1x1 layers.
    c                    sZ   t    |jd }d}ttj||ddddt|tdtj|ddddd| _d S )N   r   r   )r*   r+   r,   Tr   )	r-   r.   fusion_hidden_sizesr   Z
Sequentialr   r0   r2   matting_convs)r#   r!   r(   Zmid_channelsr4   r   r   r.      s   


zVitMatteHead.__init__c                 C   s   |  |}|S r6   )rR   r7   r   r   r   r8      s   
zVitMatteHead.forwardr9   r   r   r4   r   rN      s    rN   c                       r;   )VitMatteDetailCaptureModulezG
    Simple and lightweight Detail Capture Module for ViT Matting.
    c              	      s   t    t|jt|jd krtd|| _t|| _| jj	| _	t
 | _|jg|j | _tt| jd D ]}| jt|| j| | j	|d    | j|d  d q8t|| _d S )Nr   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r!   r(   r)   )r-   r.   rC   rQ   r>   
ValueErrorr!   r<   
convstreamrA   r   r?   fusion_blocksZhidden_sizeZfusion_channelsrB   rD   rH   rN   matting_head)r#   r!   rE   r4   r   r   r.      s&   



z$VitMatteDetailCaptureModule.__init__c                 C   s`   |  |}tt| jD ]}dtt| j| d  }| j| ||| }qt| |}|S )NrF   r   )rU   rB   rC   rV   rG   r   ZsigmoidrW   )r#   rL   r   Zdetail_featuresrE   Zdetailed_feature_map_namer   r   r   r   r8      s   
z#VitMatteDetailCaptureModule.forwardr9   r   r   r4   r   rS      s    rS   zX
    ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    )Zcustom_introc                       sb   e Zd Z fddZe					ddeej dee dee deej dee f
d	d
Z	  Z
S )VitMatteForImageMattingc                    s2   t  | || _t|| _t|| _|   d S r6   )r-   r.   r!   r
   backbonerS   decoderZ	post_init)r#   r!   r4   r   r   r.      s
   

z VitMatteForImageMatting.__init__Nr   output_attentionsoutput_hidden_stateslabelsreturn_dictc                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}d}|dur(td| jj|||d}|jd }| ||}	|sR|	f|dd  }
|durP|f|
 S |
S t	||	|j
|jdS )a8  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth image matting for computing the loss.

        Examples:

        ```python
        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
        >>> import torch
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")
        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        ... )
        >>> trimap = Image.open(filepath).convert("L")

        >>> # prepare image + trimap for the model
        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

        >>> with torch.no_grad():
        ...     alphas = model(**inputs).alphas
        >>> print(alphas.shape)
        torch.Size([1, 1, 640, 960])
        ```NzTraining is not yet supported)r\   r[   rO   r   )r   r   r   r   )r!   Zuse_return_dictr\   r[   NotImplementedErrorrY   Zforward_with_filtered_kwargsZfeature_mapsrZ   r   r   r   )r#   r   r[   r\   r]   r^   r   ZoutputsrL   r   outputr   r   r   r8      s*   )
zVitMatteForImageMatting.forward)NNNNN)r   r   r   r.   r	   r   r   ZTensorboolr8   r:   r   r   r4   r   rX      s&    
rX   )r   dataclassesr   typingr   r   r   r   Zmodeling_utilsr   utilsr   r	   Zutils.backbone_utilsr
   Zconfiguration_vitmatter   r   r   Moduler&   r<   rH   rN   rS   rX   __all__r   r   r   r   <module>   s.   #)Q