
    fTh                        S r SSKrSSKJr  SSKJrJrJrJ	r	J
r
Jr  SSKrSSKrSSKJr  SSKJr  SSKJr  SS	KJrJrJr  SS
KJrJr  SSKJrJr  SSKJrJrJ r J!r!  SSK"J#r#  SSK$J%r%  \ RL                  " \'5      r(\ " S S\5      5       r)\ " S S\5      5       r* " S S\RV                  5      r, " S S\RV                  5      r- " S S\RV                  5      r. SPS\RV                  S\R^                  S\R^                  S\R^                  S\\R^                     S\0S \04S! jjr1 " S" S#\RV                  5      r2 " S$ S%\RV                  5      r3 " S& S'\RV                  5      r4 " S( S)\RV                  5      r5 " S* S+\RV                  5      r6 " S, S-\RV                  5      r7 " S. S/\RV                  5      r8 " S0 S1\RV                  5      r9S2 r: " S3 S4\RV                  5      r; " S5 S6\RV                  5      r< " S7 S8\RV                  5      r= " S9 S:\RV                  5      r>\ " S; S<\5      5       r?\ " S= S>\?5      5       r@ " S? S@\RV                  5      rA " SA SB\RV                  5      rB " SC SD\RV                  5      rC\" SESF9 " SG SH\?5      5       rD " SI SJ\RV                  5      rE " SK SL\RV                  5      rF\ " SM SN\?5      5       rG/ SOQrHg)QzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)	dataclass)CallableListOptionalSetTupleUnion)nn)CrossEntropyLoss   )ACT2FN)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int)load_backbone   )	DPTConfigc                   t    \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                  S4      \	S'   Srg)*BaseModelOutputWithIntermediateActivations+   a  
Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
in the context of Vision models.:

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
Nlast_hidden_states.intermediate_activations )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r    r   __static_attributes__r!       \/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/dpt/modeling_dpt.pyr   r   +   s?    	 7;!2!23:HLhuU->->-C'DELr+   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)4BaseModelOutputWithPoolingAndIntermediateActivations<   a  
Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
activations that can be used by the model at later stages.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
Nlast_hidden_statepooler_output.hidden_states
attentionsr    r!   )r"   r#   r$   r%   r&   r0   r   r'   r(   r)   r1   r2   r   r3   r    r*   r!   r+   r,   r.   r.   <   s    6 6:x 1 12915M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>HLhuU->->-C'DELr+   r.   c            	          ^  \ rS rSrSrSU 4S jjrSS jr SS\R                  S\	S\	S\R                  4S	 jjr
S
rU =r$ )DPTViTHybridEmbeddings`   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                 x  > [         T
U ]  5         UR                  UR                  pCUR                  UR
                  pe[        U[        R                  R                  5      (       a  UOX34n[        U[        R                  R                  5      (       a  UOXD4nUS   US   -  US   US   -  -  n[        U5      U l        U R                  R                  S   n[        U R                  R                  5      S:w  a+  [        S[        U R                  R                  5       35      eSS/U l        Uc  UR                   n	U	SS  nU	S   nOG[        U[        R                  R                  5      (       a  UOX"4nU R                  R                  S   nX0l        US   U l        XPl        ["        R$                  " XSS9U l        ["        R(                  " [*        R,                  " SSUR
                  5      5      U l        ["        R(                  " [*        R,                  " SUS-   UR
                  5      5      U l        g )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper
   Conv2d
projection	Parameterr'   zeros	cls_tokenposition_embeddings)selfconfigfeature_sizer>   r?   r@   rA   num_patchesfeature_dimfeat_map_shape	__class__s             r,   r=   DPTViTHybridEmbeddings.__init__g   s   !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY%f-mm,,R0t}}%%&!+PQTUYUbUbUkUkQlPmnoo+,a&'#::N)"#.L(+K !+<9Q9Q R RYeXt  --004K$$Q-())K!Lekk!Q8J8J&KL#%<<A{QPVPbPb0c#d r+   c                 `   US S 2S U24   nUSUS 24   n[        [        U5      S-  5      nUR                  SXwS5      R                  SSSS5      n[        R
                  R                  XbU4SS9nUR                  SSSS5      R                  SX#-  S5      n[        R                  " XV/SS	9nU$ 
Nr         ?r   r8   r      bilinear)sizemodedim)	r   rH   reshapepermuter
   
functionalinterpolater'   catrR   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizes           r,   _resize_pos_embed(DPTViTHybridEmbeddings._resize_pos_embed   s    A||O,
Q_-!#k"2c"9:!))!]2NVVWXZ[]^`abmm//UdBelv/w!))!Q15==aAQAceghJ4!<r+   pixel_valuesinterpolate_pos_encodingreturn_dictreturnc                    UR                   u  pEpgXPR                  :w  a  [        S5      eU(       dV  X`R                  S   :w  d  XpR                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R	                  U R
                  X`R                  -  XpR                  -  5      nU R                  U5      n	U	R                  S   n
U R                   Vs/ s H  oR                  U   PM     nnU R                  U
5      R                  S	5      R                  SS	5      nU R                  R                  USS5      n[        R                   " X4SS
9nX-   nU(       d  X4$ [#        UUS9$ s  snf )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r8   r]   ra   )r   r    )shaper@   rI   r>   rp   rQ   r?   rF   feature_mapsrJ   rM   flatten	transposerP   expandr'   rg   r   )rR   rr   rs   rt   
batch_sizer@   heightwidthrQ   backbone_outputfeaturesindexoutput_hidden_states
embeddings
cls_tokenss                  r,   forwardDPTViTHybridEmbeddings.forward   s    3?2D2D/
&,,,w  (++u8J/J (% 9+,Adooa.@-AE 
 #44$$f&?//AY
 --5"//3 RVQpQpqQp < <U CQpq__X.66q9CCAqI
^^**:r2>
YY
7Q?
  5
55 :)%9
 	
  rs   *E?)rF   rP   r>   r@   r?   rQ   rM   rJ   Nr   )FF)r"   r#   r$   r%   r&   r=   rp   r'   Tensorboolr   r*   __classcell__rX   s   @r,   r5   r5   `   sM     eD gl)
!LL)
DH)
_c)
	)
 )
r+   r5   c                   @   ^  \ rS rSrSrU 4S jrSS jrSS jrSrU =r	$ )	DPTViTEmbeddings   z:
Construct the CLS token, position and patch embeddings.

c                   > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        [        U5      U l	        U R                  R                  n[        R                  " [        R
                  " SUS-   UR                  5      5      U l        [        R                  " UR                  5      U l        Xl        g )Nr   )r<   r=   r
   rN   r'   rO   rA   rP   DPTViTPatchEmbeddingspatch_embeddingsrU   rQ   Dropouthidden_dropout_probdropoutrS   )rR   rS   rU   rX   s      r,   r=   DPTViTEmbeddings.__init__   s    ekk!Q8J8J&KL 5f =++77#%<<A{QPVPbPb0c#d zz&"<"<=r+   c                 l   US S 2S U24   nUSUS 24   n[        UR                  S5      S-  5      nUR                  SXwS5      R                  SSSS5      n[        R
                  R                  XbU4SS9nUR                  SSSS5      R                  SX#-  S5      n[        R                  " XV/SS	9nU$ r[   )	r   r_   rc   rd   r
   re   rf   r'   rg   rh   s           r,   rp   "DPTViTEmbeddings._resize_pos_embed   s    A||O,
Q_-!+"2"21"5"<=!))!]2NVVWXZ[]^`abmm//UdBelv/w!))!Q15==aAQAceghJ4!<r+   c                    UR                   u  p4pVU R                  R                  nU R                  U R                  XW-  Xg-  5      nU R                  U5      n	U	R                  5       u  p:nU R                  R                  USS5      n[        R                  " X4SS9n	X-   n	U R                  U	5      n	U(       d  U	4$ [        U	S9$ )Nr8   r   ra   )r   )ry   rS   r?   rp   rQ   r   r_   rP   r}   r'   rg   r   r   )rR   rr   rt   r~   r@   r   r   r?   rQ   r   seq_len_r   s                r,   r   DPTViTEmbeddings.forward   s    2>2D2D/
& [[++
"44$$f&:E<O
 **<8
!+!2
Q ^^**:r2>
YY
7Q?
  5
\\*-
= 9ZXXr+   )rP   rS   r   r   rQ   r   )F)
r"   r#   r$   r%   r&   r=   rp   r   r*   r   r   s   @r,   r   r      s    
Y Yr+   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r      z
Image to Patch Embedding.

c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   )r;   stride)r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rU   r
   rL   rM   )rR   rS   r>   r?   r@   rA   rU   rX   s          r,   r=   DPTViTPatchEmbeddings.__init__  s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir+   c                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      R	                  S5      R                  SS5      nU$ )Nrw   r]   r   )ry   r@   rI   rM   r{   r|   )rR   rr   r~   r@   r   r   r   s          r,   r   DPTViTPatchEmbeddings.forward  s\    2>2D2D/
&,,,w  __\2::1=GG1M
r+   )r>   r@   rU   r?   rM   	r"   r#   r$   r%   r&   r=   r   r*   r   r   s   @r,   r   r      s    
j r+   r   modulequerykeyvalueattention_maskscalingr   c                    [         R                  " XR                  SS5      5      U-  n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9nUb  X-  n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr8   r9   )rb   dtype)ptrainingr   r]   )r'   matmulr|   r
   re   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r,   eager_attention_forwardr     s     <<}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L !#4,,|3K''1-88:K$$r+   c            
          ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jr SS\	\R                     S	\
S\\\R                  \R                  4   \\R                     4   4S
 jjrSrU =r$ )DPTSelfAttentioni<  rS   ru   Nc                 0  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        UR                  U l        U R                  S-  U l        SU l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        g )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r<   r=   rA   num_attention_headshasattrrI   rS   intattention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr
   Linearqkv_biasr   r   r   rR   rS   rX   s     r,   r=   DPTSelfAttention.__init__=  sG    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r+   xc                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr8   r   r]   r   r   )r_   r   r   viewrd   )rR   r   new_x_shapes      r,   transpose_for_scores%DPTSelfAttention.transpose_for_scoresQ  sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r+   	head_maskoutput_attentionsc                    U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      n[        nU R
                  R                  S:w  aT  U R
                  R                  S:X  a  U(       a  [        R                  S5        O[        U R
                  R                     nU" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pUR                  5       S S U R                  4-   n
UR!                  U
5      nU(       a  X4nU$ U4nU$ )Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   r9   )r   r   r   r   r   rS   _attn_implementationloggerwarning_oncer   r   r   r   r   r_   r   rc   )rR   r2   r   r   	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputss               r,   r   DPTSelfAttention.forwardV  s9    --dhh}.EF	//

=0IJ//

=0IJ(?;;++w6{{//69>O##L
 '>dkk>^>^&_#)<nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EF6G=2 O\M]r+   )
r   r   rS   r   r   r   r   r   r   r   NF)r"   r#   r$   r%   r   r=   r'   r   r   r   r   r	   r   r   r*   r   r   s   @r,   r   r   <  s    ]y ]T ](%ell %u|| % bg!(0(>!Z^!	uU\\5<</0%2EE	F! !r+   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )DPTViTSelfOutputi{  z
The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
rS   ru   Nc                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g r   )	r<   r=   r
   r   rA   denser   r   r   r   s     r,   r=   DPTViTSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r+   r2   input_tensorc                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rR   r2   r   s      r,   r   DPTViTSelfOutput.forward  s$    

=1]3r+   r   )r"   r#   r$   r%   r&   r   r=   r'   r   r   r*   r   r   s   @r,   r   r   {  sI    
>y >T >
U\\  RWR^R^  r+   r   c                      ^  \ rS rSrS\SS4U 4S jjrS\\   SS4S jr  SS\	R                  S	\\	R                     S
\S\\\	R                  \	R                  4   \\	R                     4   4S jjrSrU =r$ )DPTViTAttentioni  rS   ru   Nc                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )r<   r=   r   	attentionr   outputsetpruned_headsr   s     r,   r=   DPTViTAttention.__init__  s0    )&1&v.Er+   headsc                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   ra   )rH   r   r   r   r   r   r   r   r   r   r   r   r   union)rR   r   r   s      r,   prune_headsDPTViTAttention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r+   r2   r   r   c                 f    U R                  XU5      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   )r   r   )rR   r2   r   r   self_outputsattention_outputr   s          r,   r   DPTViTAttention.forward  sC     ~~m@QR;;|AF#%QR(88r+   )r   r   r   r   )r"   r#   r$   r%   r   r=   r   r   r   r'   r   r   r   r	   r   r   r*   r   r   s   @r,   r   r     s    "y "T ";S ;d ;, -1"'	|| ELL)  	
 
uU\\5<</0%2EE	F r+   r   c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	DPTViTIntermediatei  rS   ru   Nc                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r<   r=   r
   r   rA   intermediate_sizer   rB   
hidden_actstrr   intermediate_act_fnr   s     r,   r=   DPTViTIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r+   r2   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r  )rR   r2   s     r,   r   DPTViTIntermediate.forward  s&    

=100?r+   r	  r"   r#   r$   r%   r   r=   r'   r   r   r*   r   r   s   @r,   r  r    s6    9y 9T 9U\\ ell  r+   r  c                      ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  S\R                  4S jrS	r	U =r
$ )
DPTViTOutputi  rS   ru   Nc                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r   )
r<   r=   r
   r   r  rA   r   r   r   r   r   s     r,   r=   DPTViTOutput.__init__  sB    YYv779K9KL
zz&"<"<=r+   r2   r   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r   r   s      r,   r   DPTViTOutput.forward  s,    

=1]3%4r+   r   r  r   s   @r,   r  r    sD    >y >T >
U\\  RWR^R^  r+   r  c                      ^  \ rS rSrSrS\SS4U 4S jjr  SS\R                  S\	\R                     S	\
S\\\R                  \R                  4   \\R                     4   4S
 jjrSrU =r$ )DPTViTLayeri  z?This corresponds to the Block class in the timm implementation.rS   ru   Nc                 j  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        g )Nr   eps)r<   r=   chunk_size_feed_forwardseq_len_dimr   r   r  intermediater  r   r
   	LayerNormrA   layer_norm_epslayernorm_beforelayernorm_afterr   s     r,   r=   DPTViTLayer.__init__  s    '-'E'E$(0.v6"6* "V-?-?VEZEZ [!||F,>,>FDYDYZr+   r2   r   r   c                     U R                  U R                  U5      UUS9nUS   nUSS  nXQ-   nU R                  U5      nU R                  U5      nU R	                  Xq5      nU4U-   nU$ )N)r   r   r   )r   r  r  r  r   )rR   r2   r   r   self_attention_outputsr   r   layer_outputs           r,   r   DPTViTLayer.forward  s     "&!!-0/ "0 "

 2!4(, )8 ++M:((6 {{<?/G+r+   )r   r  r  r  r  r   r  r   )r"   r#   r$   r%   r&   r   r=   r'   r   r   r   r	   r   r   r*   r   r   s   @r,   r  r    s    I[y [T [ -1"'	|| ELL)  	
 
uU\\5<</0%2EE	F r+   r  c                      ^  \ rS rSrS\SS4U 4S jjr    SS\R                  S\\R                     S\	S	\	S
\	S\
\\4   4S jjrSrU =r$ )DPTViTEncoderi  rS   ru   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r   )
r<   r=   rS   r
   
ModuleListrangenum_hidden_layersr  layergradient_checkpointingrR   rS   r   rX   s      r,   r=   DPTViTEncoder.__init__  sR    ]]vG_G_A`#aA`AK$7A`#ab
&+# $bs   A&r2   r   r   r   rt   c                    U(       a  SOS nU(       a  SOS n[        U R                  5       Hz  u  pU(       a  Xa4-   nUb  X(   OS n
U R                  (       a0  U R                  (       a  U R	                  U	R
                  UU
U5      nO	U	" XU5      nUS   nU(       d  Mr  X{S   4-   nM|     U(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nr!   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r!   ).0vs     r,   	<genexpr>(DPTViTEncoder.forward.<locals>.<genexpr>.  s     m$[q$[s   	)r0   r2   r3   )	enumerater)  r*  r   _gradient_checkpointing_func__call__tupler   )rR   r2   r   r   r   rt   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss               r,   r   DPTViTEncoder.forward
  s     #7BD$5b4(4OA#$58H$H!.7.CilO**t}} $ A A ))!#%	! !-]M^ _)!,M  &91=M<O&O#'  5*   14D Dm]GZ$[mmm++*
 	
r+   )rS   r*  r)  )NFFT)r"   r#   r$   r%   r   r=   r'   r   r   r   r	   r6  r   r   r*   r   r   s   @r,   r$  r$    s    ,y ,T , -1"'%* )
||)
 ELL))
  	)

 #)
 )
 
uo%	&)
 )
r+   r$  c                      ^  \ rS rSrSrU 4S jrS rS rS
S\\	R                     S\\	R                     4S jjrS	rU =r$ )DPTReassembleStagei6  a  
This class reassembles the hidden states of the backbone into image-like feature representations at various
resolutions.

This happens in 3 stages:
1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
   `config.readout_type`.
2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
3. Resizing the spatial dimensions (height, width).

Args:
    config (`[DPTConfig]`):
        Model configuration class defining the model architecture.
c                    > [         TU ]  5         Xl        [        R                  " 5       U l        UR                  (       a  U R                  U5        OU R                  U5        UR                  U l	        g r   )
r<   r=   rS   r
   r&  layers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   s     r,   r=   DPTReassembleStage.__init__F  sS    mmo,,V4%%f-"(";";r+   c           	         [        [        [        UR                  5      5      UR                  5       Hs  u  p#US::  a0  U R
                  R                  [        R                  " 5       5        M;  US:  d  MC  U R
                  R                  [        XR                  U   US95        Mu     UR                  S:w  a  [        SUR                   S35      e[        R                  " 5       U l        [        U5      n[        [        UR                  5      5       H  nUS::  aD  U R                  R                  [        R                  " [        R                  " 5       5      5        MM  US:  d  MU  U R                  R                  [        R                  " [        R                   " SU-  U5      ["        UR$                     5      5        M     g)z"
For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
for more details.
r   rG   factorprojectzReadout type z! is not supported for DPT-Hybrid.r]   N)zipr'  rH   neck_hidden_sizesreassemble_factorsrA  appendr
   IdentityDPTReassembleLayerreadout_typerI   r&  readout_projects_get_backbone_hidden_size
Sequentialr   r   r  )rR   rS   r9  rI  rA   s        r,   rC  .DPTReassembleStage._init_reassemble_dpt_hybridR  sI    U3v'?'?#@A6C\C\]IAAv""2;;=1Q""#5fG_G_`aGbkq#rs	 ^ )+}V-@-@,AAbcdd !#/7s63345AAv%%,,R]]2;;=-IJQ%%,,MM"))AO["I6RXRcRcKde	 6r+   c           	      B   [        [        [        UR                  5      5      UR                  5       H5  u  p#U R
                  R                  [        XR                  U   US95        M7     UR                  S:X  a  [        R                  " 5       U l        [        U5      n[        [        UR                  5      5       H\  nU R                  R                  [        R                  " [        R                  " SU-  U5      [        UR                      5      5        M^     g g )NrH  rJ  r]   )rK  r'  rH   rL  rM  rA  rN  rP  rQ  r
   r&  rR  rS  rT  r   r   r  )rR   rS   r9  rI  rA   r   s         r,   rD  'DPTReassembleStage._init_reassemble_dptl  s    U3v'?'?#@A6C\C\]IAKK1&C[C[\]C^gmno ^ )+$&MMOD!3F;K3v7789%%,,MM"))AO["I6RXRcRcKde : ,r+   r2   ru   c                    / n[        U5       GH  u  pVXPR                  ;  Ga  USS2S4   USS2SS24   pgUR                  u  pn
Ub  Ub  UR                  XX:5      nO [	        U	S-  5      nUR                  XX5      nUR                  SSSS5      R                  5       nUR                  nU R                  R                  S:X  a  UR                  S5      R                  S5      nUR                  S5      R                  U5      nU R                  U   " [        R                  " Xm4S	5      5      nUR                  SSS5      R                  U5      nONU R                  R                  S
:X  a4  UR                  S5      UR                  S	5      -   nUR                  U5      nU R                  U   " U5      nUR!                  U5        GM     U$ )z
Args:
    hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
        List of hidden states from the backbone.
Nr   r   r\   r   r]   rJ  )r   r]   r   r8   add)r3  rE  ry   rc   r   rd   r   rS   rQ  r{   	unsqueeze	expand_asrR  r'   rg   rA  rN  )rR   r2   patch_heightpatch_widthoutr9  hidden_staterP   r~   sequence_lengthr@   r_   feature_shapereadouts                 r,   r   DPTReassembleStage.forwardx  s    (7OA///*6q!t*<l1ab5>Q<<H<N<N9
\+0G#/#7#7
R]#lL$_c%9:D#/#7#7
$#]L+33Aq!Q?JJL , 2 2;;++y8#/#7#7#:#B#B9#ML'11!4>>|LG#'#8#8#;EII|F]_a<b#cL#/#7#71a#@#H#H#WL[[--6#/#7#7#:Y=P=PQS=T#TL#/#7#7#FL#{{1~l;JJ|$3  86 
r+   )rS   rA  rE  rR  NN)r"   r#   r$   r%   r&   r=   rC  rD  r   r'   r   r   r*   r   r   s   @r,   r?  r?  6  sE    
<4
#T%,,%7 #aefkfrfras # #r+   r?  c                 ~    U R                   b%  U R                  SL a  U R                   R                  $ U R                  $ r   )backbone_configrB  rA   )rS   s    r,   rS  rS    s9    )f.>.>%.G%%111!!!r+   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )rP  i  c           	      P  > [         TU ]  5         [        U5      n[        R                  " XBSS9U l        US:  a  [        R                  " X"X3SS9U l        g US:X  a  [        R                  " 5       U l        g US:  a)  [        R                  " X"S[        SU-  5      SS9U l        g g )Nr   )in_channelsout_channelsr;   r   r;   r   paddingr   )
r<   r=   rS  r
   rL   rM   ConvTranspose2dresizerO  r   )rR   rS   rG   rI  rA   rX   s        r,   r=   DPTReassembleLayer.__init__  s    /7))`ab A:,,XVlmnDKq[++-DKaZ))HAcRSV\R\oghiDK r+   c                 J    U R                  U5      nU R                  U5      nU$ r   rM   rn  )rR   r_  s     r,   r   DPTReassembleLayer.forward  s$    |4{{<0r+   rq  r"   r#   r$   r%   r=   r   r*   r   r   s   @r,   rP  rP    s    j r+   rP  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DPTFeatureFusionStagei  c                    > [         TU ]  5         [        R                  " 5       U l        [        [        UR                  5      5       H'  nU R                  R                  [        U5      5        M)     g r   )
r<   r=   r
   r&  rA  r'  rH   rL  rN  DPTFeatureFusionLayerr+  s      r,   r=   DPTFeatureFusionStage.__init__  sM    mmos63345AKK4V<= 6r+   c                     US S S2   n/ nS n[        XR                  5       H*  u  pEUc	  U" U5      nOU" X45      nUR                  U5        M,     U$ )Nr8   )rK  rA  rN  )rR   r2   fused_hidden_statesfused_hidden_stater_  r)  s         r,   r   DPTFeatureFusionStage.forward  sg    %dd+ !#&}kk#BL!)%*<%8"%*+=%L"&&'9: $C #"r+   )rA  rs  r   s   @r,   ru  ru    s    ># #r+   ru  c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )DPTPreActResidualLayeri  z
ResidualConvUnit, pre-activate residual unit.

Args:
    config (`[DPTConfig]`):
        Model configuration class defining the model architecture.
c           	        > [         TU ]  5         UR                  U l        UR                  b  UR                  OU R                  (       + n[
        R                  " 5       U l        [
        R                  " UR                  UR                  SSSUS9U l
        [
        R                  " 5       U l        [
        R                  " UR                  UR                  SSSUS9U l        U R                  (       aK  [
        R                  " UR                  5      U l        [
        R                  " UR                  5      U l        g g )Nr   r   )r;   r   rl  r   )r<   r=   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr
   ReLUactivation1rL   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rR   rS   r  rX   s      r,   r=   DPTPreActResidualLayer.__init__  s   $FF 11= ..((( 	$ 779II%%%%,
 779II%%%%,
 !~~f.G.GHD!~~f.G.GHD r+   r_  ru   c                    UnU R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nU R	                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX-   $ r   )r  r  r  r  r  r  r  rR   r_  residuals      r,   r   DPTPreActResidualLayer.forward  s    ''5((6++L9L''5((6++L9L&&r+   )r  r  r  r  r  r  r  )r"   r#   r$   r%   r&   r=   r'   r   r   r*   r   r   s   @r,   r~  r~    s/     ID'ELL 'U\\ ' 'r+   r~  c                   :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )rw  i  a  Feature fusion layer, merges feature maps from different stages.

Args:
    config (`[DPTConfig]`):
        Model configuration class defining the model architecture.
    align_corners (`bool`, *optional*, defaults to `True`):
        The align_corner setting for bilinear upsample.
c                    > [         TU ]  5         X l        [        R                  " UR
                  UR
                  SSS9U l        [        U5      U l        [        U5      U l	        g )Nr   T)r;   r   )
r<   r=   align_cornersr
   rL   r  rM   r~  residual_layer1residual_layer2)rR   rS   r  rX   s      r,   r=   DPTFeatureFusionLayer.__init__  sR    *))F$=$=v?X?Xfgnrs5f=5f=r+   c                 t   Ubh  UR                   UR                   :w  a;  [        R                  R                  X!R                   S   UR                   S   4SSS9nXR	                  U5      -   nU R                  U5      n[        R                  R                  USSU R                  S9nU R                  U5      nU$ )Nr]   r   r^   Fr_   r`   r  scale_factorr`   r  )ry   r
   re   rf   r  r  r  rM   r  s      r,   r   DPTFeatureFusionLayer.forward$  s    !!X^^3==44$6$6q$9<;M;Ma;P#QXbrw 5  (*>*>x*HHL++L9}}00qzI[I[ 1 
 |4r+   )r  rM   r  r  Tr   r   r   s   @r,   rw  rw    s    > r+   rw  c                   2    \ rS rSr\rSrSrSrSr	Sr
S rSrg)DPTPreTrainedModeli5  dptrr   Tc                    [        U[        R                  [        R                  [        R                  45      (       aj  UR
                  R                  R                  SU R                  R                  S9  UR                  b$  UR                  R                  R                  5         Ox[        U[        R                  [        R                  45      (       aI  UR                  R                  R                  5         UR
                  R                  R                  S5        [        U[        [         45      (       aI  UR"                  R                  R                  5         UR$                  R                  R                  5         gg)zInitialize the weightsr   )meanstdNg      ?)rB   r
   r   rL   rm  weightdatanormal_rS   initializer_ranger   zero_r  r  fill_r   r5   rP   rQ   )rR   r   s     r,   _init_weights DPTPreTrainedModel._init_weights>  s    fryy"))R5G5GHII MM&&CT[[5R5R&S{{&  &&(r~~ >??KK""$MM$$S)f/1GHII!!'')&&++113 Jr+   r!   N)r"   r#   r$   r%   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_2r  r*   r!   r+   r,   r  r  5  s(    L$O&*#N!4r+   r  c                      ^  \ rS rSrSU 4S jjrS rS r\    SS\R                  S\
\R                     S\
\   S\
\   S	\
\   S
\\\4   4S jj5       rSrU =r$ )DPTModeliN  c                 b  > [         TU ]  U5        Xl        UR                  (       a  [	        U5      U l        O[        U5      U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        U(       a  [        U5      OSU l        U R!                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
r  N)r<   r=   rS   rB  r5   r   r   r$  encoderr
   r  rA   r  	layernormDPTViTPoolerpooler	post_init)rR   rS   add_pooling_layerrX   s      r,   r=   DPTModel.__init__P  s    
 	  4V<DO.v6DO$V,f&8&8f>S>ST.?l6*T 	r+   c                 |    U R                   R                  (       a  U R                  $ U R                  R                  $ r   )rS   rB  r   r   )rR   s    r,   get_input_embeddingsDPTModel.get_input_embeddingse  s)    ;;  ??"??333r+   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r)  r   r   )rR   heads_to_pruner)  r   s       r,   _prune_headsDPTModel._prune_headsk  s<    
 +002LELLu%//;;EB 3r+   rr   r   r   r   rt   ru   c                 ^   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  X R                   R
                  5      nU R                  XS9nU(       d  US   OUR                  nU R                  UUUUUS9nUS   n	U R                  U	5      n	U R                  b  U R                  U	5      OS n
U(       d  U
b  X4OU	4nXSS  -   USS  -   $ [        U	U
UR                  UR                  UR                  S9$ )N)rt   r   r   r   r   rt   r   )r0   r1   r2   r3   r    )rS   r   r   use_return_dictget_head_maskr(  r   r   r  r  r  r.   r2   r3   r    )rR   rr   r   r   r   rt   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputhead_outputss               r,   r   DPTModel.forwards  sM    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] &&y++2O2OP	??<?QBM'7':ScSvSv$,,(/!5# ' 
 *!,..98<8OO4UY?L?XO;_n^pL!""558H8LLLC-')77&11%5%N%N
 	
r+   )rS   r   r  r  r  r  )NNNN)r"   r#   r$   r%   r=   r  r  r   r'   r(   r   r   r	   r   r.   r   r*   r   r   s   @r,   r  r  N  s    *4C  26,0/3&*/
''/
 E--./
 $D>	/

 'tn/
 d^/
 
uJJ	K/
 /
r+   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )r  i  rS   c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        g r   )
r<   r=   r
   r   rA   pooler_output_sizer   r   
pooler_act
activationr   s     r,   r=   DPTViTPooler.__init__  s>    YYv1163L3LM
 !2!23r+   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r  )rR   r2   first_token_tensorr  s       r,   r   DPTViTPooler.forward  s6     +1a40

#566r+   )r  r   )	r"   r#   r$   r%   r   r=   r   r*   r   r   s   @r,   r  r    s    4y 4
 r+   r  c                   v   ^  \ rS rSrSrU 4S jrSS\\R                     S\\R                     4S jjr	Sr
U =r$ )	DPTNecki  a  
DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
input and produces another list of tensors as output. For DPT, it includes 2 stages:

* DPTReassembleStage
* DPTFeatureFusionStage.

Args:
    config (dict): config dict.
c                   > [         TU ]  5         Xl        UR                  b"  UR                  R                  S;   a  S U l        O[        U5      U l        [        R                  " 5       U l	        UR                   H=  nU R                  R                  [        R                  " X!R                  SSSS95        M?     [        U5      U l        g )N)swinv2r   r   Fr;   rl  r   )r<   r=   rS   rf  
model_typereassemble_stager?  r
   r&  convsrL  rN  rL   r  ru  fusion_stage)rR   rS   channelrX   s      r,   r=   DPTNeck.__init__  s     !!-&2H2H2S2SWa2a$(D!$6v$>D!]]_
//GJJbii1J1JXYcdkpqr 0 2&9r+   r2   ru   c                    [        U[        [        45      (       d  [        S5      e[	        U5      [	        U R
                  R                  5      :w  a  [        S5      eU R                  b  U R                  XU5      n[        U5       VVs/ s H  u  pEU R                  U   " U5      PM     nnnU R                  U5      nU$ s  snnf )z
Args:
    hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
        List of hidden states from the backbone.
z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.)rB   r6  list	TypeErrorrH   rS   rL  rI   r  r3  r  r  )rR   r2   r\  r]  r9  featurer   r   s           r,   r   DPTNeck.forward  s     -%77PQQ}T[[%B%B!CCnoo   , 11-{[M=F}=UV=UzqDJJqM'*=UV ""8, Ws   !C)rS   r  r  r  rd  r"   r#   r$   r%   r&   r=   r   r'   r   r   r*   r   r   s   @r,   r  r    s;    	:"T%,,%7 aefkfrfras  r+   r  c                   l   ^  \ rS rSrSrU 4S jrS\\R                     S\R                  4S jr	Sr
U =r$ )DPTDepthEstimationHeadi  z
Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
supplementary material).
c                   > [         TU ]  5         Xl        S U l        UR                  (       a  [
        R                  " SSSSSS9U l        UR                  n[
        R                  " [
        R                  " X"S-  SSSS9[
        R                  " SSS	S
9[
        R                  " US-  SSSSS9[
        R                  " 5       [
        R                  " SSSSSS9[
        R                  " 5       5      U l        g )N   )r   r   )r   r   rk  r]   r   r   r^   Tr      r   )r<   r=   rS   rM   add_projectionr
   rL   r  rT  Upsampler  headrR   rS   r   rX   s      r,   r=   DPTDepthEstimationHead.__init__  s       iiSfV]cdDO,,MMIIhA1QPQRKKQZtLIIh!mRQq!LGGIIIb!1a@GGI
	r+   r2   ru   c                     XR                   R                     nU R                  b,  U R                  U5      n[        R                  " 5       " U5      nU R                  U5      nUR                  SS9nU$ )Nr   ra   )rS   head_in_indexrM   r
   r  r  squeeze)rR   r2   predicted_depths      r,   r   DPTDepthEstimationHead.forward  sc    %kk&?&?@??& OOM:MGGIm4M))M2)11a18r+   )rS   r  rM   r  r   s   @r,   r  r    s2    
&T%,,%7 ELL  r+   r  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    )custom_introc                      ^  \ rS rSrU 4S jr\     SS\R                  S\\R                     S\\R                     S\\
   S\\
   S\\
   S	\\\R                     \4   4S
 jj5       rSrU =r$ )DPTForDepthEstimationi  c                 $  > [         TU ]  U5        S U l        UR                  SL a+  UR                  c  UR                  b  [        U5      U l        O[        USS9U l        [        U5      U l	        [        U5      U l        U R                  5         g NF)r  )r<   r=   rF   rB  rf  r   r  r  r  neckr  r  r  r   s     r,   r=   DPTForDepthEstimation.__init__  s}     u$&*@*@*LPVP_P_Pk)&1DM%@DH FO	 +62	 	r+   rr   r   labelsr   r   rt   ru   c                   ^  SnUb  [        S5      eUb  UOT R                  R                  nUb  UOT R                  R                  nUb  UOT R                  R                  nT R
                  b'  T R
                  R                  XUS9nUR                  n	OT R                  UUUSUS9nU(       a  UR                  OUS   n	T R                  R                  (       d?  [        U	SS 5       V
Vs/ s H#  u  pU
T R                  R                  ;   d  M!  UPM%     n	n
nOJU(       a  UR                  O[        US   5      nUR                  U 4S j[        U	SS 5       5       5        Un	S	u  pT R                  R                   bT  T R                  R                  S
L a;  UR"                  u    nnnT R                  R                   R$                  nUU-  nUU-  nT R'                  XU5      n	T R)                  U	5      nU(       d%  U(       a
  U4USS -   nO	U4USS -   nUb  U4U-   $ U$ [+        UUU(       a  UR                  OSUR,                  S9$ s  snn
f )a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth depth estimation maps for computing the loss.

Examples:
```python
>>> from transformers import AutoImageProcessor, DPTForDepthEstimation
>>> import torch
>>> import numpy as np
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
>>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # interpolate to original size
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     target_sizes=[(image.height, image.width)],
... )

>>> # visualize the prediction
>>> predicted_depth = post_processed_output[0]["predicted_depth"]
>>> depth = predicted_depth * 255 / predicted_depth.max()
>>> depth = depth.detach().cpu().numpy()
>>> depth = Image.fromarray(depth.astype("uint8"))
```NzTraining is not implemented yet)r   r   Tr  r   r8   c              3   j   >#    U  H(  u  pUTR                   R                  S S ;   d  M$  Uv   M*     g7fr]   NrS   backbone_out_indicesr/  idxr  rR   s      r,   r1  0DPTForDepthEstimation.forward.<locals>.<genexpr>z  s5      .(Ddkk>>qrBB G(D   #3	3rd  Fr]   )lossr  r2   r3   )NotImplementedErrorrS   r  r   r   rF   forward_with_filtered_kwargsrz   r  r2   rB  r3  r  r    r  extendrf  ry   r?   r  r  r   r3   )rR   rr   r   r  r   r   rt   r  r   r2   r  r  backbone_hidden_statesr\  r]  r   r   r   r?   r  r   s   `                    r,   r   DPTForDepthEstimation.forward,  sp   \ %&GHH%0%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq==$mm@@[l A G $00Mhh#"3%)'  G 6AG11gajM ;;((09-:K0L!0LPSW[WbWbWwWwPwG0L  ! NY)I)I^bcjkmcn^o&&-- .(1-2C(D.  !7$.!;;&&2t{{7L7LPU7U"."4"4Aq&%44??J!Z/L:-K		-{K))M2#)+gabk9)+gabk9)-)9TGf$EvE#+3G'//T))	
 	
?!s   . II)rF   r  r  r  )NNNNN)r"   r#   r$   r%   r=   r   r'   r(   r   
LongTensorr   r	   r   r   r   r   r*   r   r   s   @r,   r  r    s    $  26-1,0/3&*l
''l
 E--.l
 ))*	l

 $D>l
 'tnl
 d^l
 
uU\\"$88	9l
 l
r+   r  c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )DPTSemanticSegmentationHeadi  c                   > [         TU ]  5         Xl        UR                  n[        R
                  " [        R                  " X"SSSS9[        R                  " U5      [        R                  " 5       [        R                  " UR                  5      [        R                  " X!R                  SS9[        R                  " SSSS	95      U l        g )
Nr   r   Fr  r:   r]   r^   Tr  )r<   r=   rS   r  r
   rT  rL   r  r  r   semantic_classifier_dropout
num_labelsr  r  r  s      r,   r=   $DPTSemanticSegmentationHead.__init__  s    ,,MMIIhaONN8$GGIJJv99:IIh 1 1qAKKQZtL
	r+   r2   ru   c                 X    XR                   R                     nU R                  U5      nU$ r   )rS   r  r  rR   r2   logitss      r,   r   #DPTSemanticSegmentationHead.forward  s'    %kk&?&?@=)r+   )rS   r  )r"   r#   r$   r%   r=   r   r'   r   r   r*   r   r   s   @r,   r  r    s-    
T%,,%7 ELL  r+   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DPTAuxiliaryHeadi  c                 T  > [         TU ]  5         UR                  n[        R                  " [        R
                  " X"SSSS9[        R                  " U5      [        R                  " 5       [        R                  " SS5      [        R
                  " X!R                  SS95      U l
        g )Nr   r   Fr  g?r:   )r<   r=   r  r
   rT  rL   r  r  r   r  r  r  s      r,   r=   DPTAuxiliaryHead.__init__  sr    ,,MMIIhaONN8$GGIJJsE"IIh 1 1qA
	r+   c                 (    U R                  U5      nU$ r   r  r  s      r,   r   DPTAuxiliaryHead.forward  s    =)r+   r  rs  r   s   @r,   r  r    s    

 r+   r  c                      ^  \ rS rSrU 4S jr\      SS\\R                     S\\R                     S\\R                     S\\
   S\\
   S\\
   S	\\\R                     \4   4S
 jj5       rSrU =r$ )DPTForSemanticSegmentationi  c                    > [         TU ]  U5        [        USS9U l        [	        U5      U l        [        U5      U l        UR                  (       a  [        U5      OS U l
        U R                  5         g r  )r<   r=   r  r  r  r  r  r  use_auxiliary_headr  auxiliary_headr  r   s     r,   r=   #DPTForSemanticSegmentation.__init__  s^     Fe< FO	 07	:@:S:S.v6Y] 	r+   rr   r   r  r   r   rt   ru   c                   ^  Ub  UOT R                   R                  nUb  UOT R                   R                  nUb%  T R                   R                  S:X  a  [	        S5      eT R                  UUUSUS9nU(       a  UR                  OUS   nT R                   R                  (       d?  [        USS 5       V	V
s/ s H#  u  pU	T R                   R                  ;   d  M!  U
PM%     nn	n
OJU(       a  UR                  O[        US   5      nUR                  U 4S j[        USS 5       5       5        UnT R                  US9nT R                  U5      nSnT R                  b  T R                  US   5      nSnUb  [         R"                  R%                  XR&                  S	S S
SS9nUb,  [         R"                  R%                  XR&                  S	S S
SS9n[)        T R                   R*                  S9nU" X5      nU" WU5      nUT R                   R,                  U-  -   nU(       d%  U(       a
  U4USS -   nO	U4USS -   nUb  U4U-   $ U$ [/        UUU(       a  UR                  OSUR0                  S9$ s  sn
n	f )aL  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:
```python
>>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
>>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oneTr  r8   c              3   j   >#    U  H(  u  pUTR                   R                  S S ;   d  M$  Uv   M*     g7fr  r  r  s      r,   r1  5DPTForSemanticSegmentation.forward.<locals>.<genexpr>  s6      *,HLCCSWS^S^SsSstutvSwLw,Hr  )r2   r9   r^   Fr  )ignore_indexr]   )r  r  r2   r3   )rS   r  r   r  rI   r  r2   rB  r3  r  r    r  r  r  r  r$  r
   re   rf   ry   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r3   )rR   rr   r   r  r   r   rt   r   r2   r  r  r  r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_lossr   s   `                    r,   r   "DPTForSemanticSegmentation.forward  s   @ &1%<k$++B]B]$8$D $++JjJj 	 $++"8"8A"=NOO((/!%#  
 2=--'!* {{$$,5mAB6G,H,HLCCSWS^S^SsSsLs,H  M JUW%E%EZ^_fgi_jZk"")) *,5mAB6G,H*  3M			>=)*#22=3DE!}}88\\"#.Zu  9    +-/]]-F-F$<<+<:]b .G .* (T[[5[5[\H !1:I%&@&INt{{@@>QQD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
Us   7 I1I1)r$  r  r  r  )NNNNNN)r"   r#   r$   r%   r=   r   r   r'   r(   r  r   r	   r   r   r   r   r*   r   r   s   @r,   r!  r!    s      5915-1,0/3&*c
u001c
 E--.c
 ))*	c

 $D>c
 'tnc
 d^c
 
uU\\"$;;	<c
 c
r+   r!  )r  r!  r  r  )r   )Ir&   collections.abcrC   dataclassesr   typingr   r   r   r   r   r	   r'   torch.utils.checkpointr
   torch.nnr   activationsr   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   r   utils.backbone_utilsr   configuration_dptr   
get_loggerr"   r   r   r.   Moduler5   r   r   r   floatr   r   r   r   r  r  r  r$  r?  rS  rP  ru  r~  rw  r  r  r  r  r  r  r  r  r!  __all__r!   r+   r,   <module>rC     s    ! > >    % ! ^ ^ F Q D D 1 ( 
		H	% M M M   M;  M  MF`
RYY `
F7Yryy 7YtBII N %II%<<% 
% <<	%
 U\\*% % %>;ryy ;~ryy $&bii &T "299  '")) 'V0
BII 0
fe eP" ,#BII #0:'RYY :'z"BII "J 4 4 40 T
! T
 T
p299 2bii 2j&RYY &R 
@
. @

@
F")) 2ryy & t
!3 t
 t
n dr+   