
    fTh+                        S r SSKJr  SSKJrJr  SSKrSSKJr  SSKJ	r	  SSK
JrJr  SS	KJr  S
SKJr  \ " S S\5      5       r\ " S S\	5      5       r " S S\R&                  5      r " S S\R&                  5      r " S S\R&                  5      r " S S\R&                  5      r " S S\R&                  5      r\" SS9 " S S\5      5       rSS/rg)zPyTorch ViTMatte model.    )	dataclass)OptionalTupleN)nn   )PreTrainedModel)ModelOutputauto_docstring)load_backbone   )VitMatteConfigc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	ImageMattingOutput   a{  
Class for outputs of image matting models.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss.
    alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
       Estimated alpha values.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlossalphashidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   r   __static_attributes__r       f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr   r      sg    ( )-D(5$$
%,*.FHU&&'.8<M8E%"3"345<59Ju00129r   r   c                   *    \ rS rSr\rSrSr/ rS r	Sr
g)VitMattePreTrainedModel9   pixel_valuesTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         g g g )Ng        )meanstd)

isinstancer   Conv2dweightdatanormal_configinitializer_rangebiaszero_)selfmodules     r    _init_weights%VitMattePreTrainedModel._init_weights@   sd    fbii((MM&&CT[[5R5R&S{{&  &&( ' )r   r   N)r   r   r   r   r   config_classmain_input_namesupports_gradient_checkpointing_no_split_modulesr3   r   r   r   r    r"   r"   9   s    !L$O&*#)r   r"   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )VitMatteBasicConv3x3G   zH
Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
c           	         > [         TU ]  5         [        R                  " UUSUUSS9U l        [        R
                  " X1R                  S9U l        [        R                  " 5       U l	        g )Nr   F)in_channelsout_channelskernel_sizestridepaddingr/   )eps)
super__init__r   r)   convBatchNorm2dbatch_norm_eps
batch_normReLUrelu)r1   r-   r=   r>   r@   rA   	__class__s         r    rD   VitMatteBasicConv3x3.__init__L   sU    II#%
	 ..;P;PQGGI	r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ N)rE   rH   rJ   r1   hidden_states     r    forwardVitMatteBasicConv3x3.forwardY   s2    yy.|4yy.r   )rH   rE   rJ   )   r   	r   r   r   r   r   rD   rQ   r   __classcell__rK   s   @r    r:   r:   G   s     r   r:   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteConvStreama   z[
Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
c                   > [         TU ]  5         SnUR                  b  UR                  R                  nUR                  n[
        R                  " 5       U l        U/U-   U l        [        [        U R                  5      S-
  5       HI  nU R                  U   nU R                  US-      nU R                  R                  [        XU5      5        MK     g )N   r   )rC   rD   backbone_confignum_channelsconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendr:   )r1   r-   r=   r>   iin_chan_	out_chan_rK   s          r    rD   VitMatteConvStream.__init__f   s     !!- 00==K55]]_
&-,6s4??+a/0Aq)HA.IJJ26YOP 1r   c                     SU0nUn[        [        U R                  5      5       H-  nU R                  U   " U5      nS[        US-   5      -   nX2U'   M/     U$ )Ndetailed_feature_map_0detailed_feature_map_r   )rb   rc   r`   str)r1   r$   out_dict
embeddingsre   name_s         r    rQ   VitMatteConvStream.forwardy   sZ    ,l;!
s4::'AAz2J+c!a%j8E(UO (
 r   )ra   r`   rT   rV   s   @r    rX   rX   a   s    Q& r   rX   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteFusionBlock   zT
Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
c                 D   > [         TU ]  5         [        XUSSS9U l        g )Nr   )r@   rA   )rC   rD   r:   rE   )r1   r-   r=   r>   rK   s       r    rD   VitMatteFusionBlock.__init__   s"    (lST^_`	r   c                     [         R                  R                  USSSS9n[        R                  " X#/SS9nU R                  U5      nU$ )NrS   bilinearF)scale_factormodealign_cornersr   )dim)r   
functionalinterpolater   catrE   )r1   featuresdetailed_feature_mapupscaled_featuresouts        r    rQ   VitMatteFusionBlock.forward   sH    MM55hQU_ot5uii-AqIiin
r   )rE   rT   rV   s   @r    rr   rr      s    a r   rr   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteHead   zB
Simple Matting Head, containing only conv3x3 and conv1x1 layers.
c                 &  > [         TU ]  5         UR                  S   nSn[        R                  " [        R
                  " X#SSSS9[        R                  " U5      [        R                  " S5      [        R
                  " USSSSS95      U l        g )N   r   r   )r?   r@   rA   Tr   )	rC   rD   fusion_hidden_sizesr   
Sequentialr)   rF   rI   matting_convs)r1   r-   r=   mid_channelsrK   s       r    rD   VitMatteHead.__init__   sr    004]]IIkQqRSTNN<(GGDMIIlA1QJ	
r   c                 (    U R                  U5      nU$ rN   r   rO   s     r    rQ   VitMatteHead.forward   s    )),7r   r   rT   rV   s   @r    r   r      s    
 r   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteDetailCaptureModule   z?
Simple and lightweight Detail Capture Module for ViT Matting.
c           
        > [         TU ]  5         [        UR                  5      [        UR                  5      S-   :w  a  [        S5      eXl        [        U5      U l        U R                  R                  U l	        [        R                  " 5       U l        UR                  /UR                  -   U l        [        [        U R                  5      S-
  5       HX  nU R                  R!                  [#        UU R                  U   U R                  US-   *    -   U R                  US-      S95        MZ     [%        U5      U l        g )Nr   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r-   r=   r>   )rC   rD   rc   r   r^   
ValueErrorr-   rX   
convstreamra   r   r_   fusion_blockshidden_sizefusion_channelsrb   rd   rr   r   matting_head)r1   r-   re   rK   s      r    rD   $VitMatteDetailCaptureModule.__init__   s   v))*c&2P2P.QTU.UUq  ,V4//44]]_ & 2 23f6P6PPs4//0145A%%#! $ 4 4Q 7$//APQE(:S S!%!5!5a!e!< 6 )0r   c                 :   U R                  U5      n[        [        U R                  5      5       HB  nS[	        [        U R                  5      U-
  S-
  5      -   nU R                  U   " XU   5      nMD     [
        R                  " U R                  U5      5      nU$ )Nrk   r   )r   rb   rc   r   rl   r   sigmoidr   )r1   r   r$   detail_featuresre   detailed_feature_map_namer   s          r    rQ   #VitMatteDetailCaptureModule.forward   s    //,7s4--./A(?#c$J\J\F]`aFadeFeBf(f%))!,XG`7abH 0 t00:;r   )r-   ra   r   r   r   r   rT   rV   s   @r    r   r      s    12 r   r   zX
    ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    )custom_introc                      ^  \ rS rSrU 4S jr\     S
S\\R                     S\\	   S\\	   S\\R                     S\\	   4
S jj5       r
S	rU =r$ )VitMatteForImageMatting   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g rN   )rC   rD   r-   r   backboner   decoder	post_init)r1   r-   rK   s     r    rD    VitMatteForImageMatting.__init__   s9     %f-26: 	r   r$   output_attentionsoutput_hidden_stateslabelsreturn_dictc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nSnUb  [	        S5      eU R
                  R                  XUS9nUR                  S   nU R                  X5      n	U(       d  U	4USS -   n
Ub  U4U
-   $ U
$ [        UU	UR                  UR                  S9$ )ap  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth image matting for computing the loss.

Examples:

```python
>>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
>>> import torch
>>> from PIL import Image
>>> from huggingface_hub import hf_hub_download

>>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
>>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

>>> filepath = hf_hub_download(
...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
... )
>>> image = Image.open(filepath).convert("RGB")
>>> filepath = hf_hub_download(
...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
... )
>>> trimap = Image.open(filepath).convert("L")

>>> # prepare image + trimap for the model
>>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

>>> with torch.no_grad():
...     alphas = model(**inputs).alphas
>>> print(alphas.shape)
torch.Size([1, 1, 640, 960])
```NzTraining is not yet supported)r   r   r   r   )r   r   r   r   )r-   use_return_dictr   r   NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsr   r   r   r   )r1   r$   r   r   r   r   r   outputsr   r   outputs              r    rQ   VitMatteForImageMatting.forward   s    R &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq%&EFF--<<Wh = 
 ''+h5Y,F)-)9TGf$EvE!!//))	
 	
r   )r   r-   r   )NNNNN)r   r   r   r   rD   r
   r   r   TensorboolrQ   r   rU   rV   s   @r    r   r      s      04,0/3)-&*B
u||,B
 $D>B
 'tn	B

 &B
 d^B
 B
r   r   )r   dataclassesr   typingr   r   r   r   modeling_utilsr   utilsr	   r
   utils.backbone_utilsr   configuration_vitmatter   r   r"   Moduler:   rX   rr   r   r   r   __all__r   r   r    <module>r      s     ! "   - 0 1 2 : : :6 
)o 
) 
)299 4   F")) "299 0&")) &R 
N
5 N

N
b %&?
@r   