
    fTh~                        S r SSKrSSKJr  SSKJrJrJrJr  SSK	r	SSK
r	SSK	Jr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJr  \R4                  " \5      r\ " S S\5      5       r " S S\R<                  5      r " S S\R<                  5      r  " S S\R<                  5      r! " S S\R<                  5      r" " S S\R<                  5      r# " S S\R<                  5      r$ " S S\R<                  5      r%S@S jr& " S S \R<                  5      r' " S! S"\R<                  5      r( " S# S$\R<                  5      r)\	RT                  RV                  SAS%\,S&\-4S' jj5       r. " S( S)\R<                  5      r/ " S* S+\R<                  5      r0 " S, S-\R<                  5      r1 " S. S/\R<                  5      r2 " S0 S1\R<                  5      r3 " S2 S3\R<                  5      r4 " S4 S5\R<                  5      r5 " S6 S7\R<                  5      r6 " S8 S9\R<                  5      r7\ " S: S;\5      5       r8\" S<S=9 " S> S?\85      5       r9S?S;/r:g)BzPyTorch ZoeDepth model.    N)	dataclass)ListOptionalTupleUnion)nn   )ACT2FN)DepthEstimatorOutput)PreTrainedModel)ModelOutputauto_docstringlogging)load_backbone   )ZoeDepthConfigc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)ZoeDepthDepthEstimatorOutput$   a  
Extension of `DepthEstimatorOutput` to include domain logits (ZoeDepth specific).

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Predicted depth for each pixel.

    domain_logits (`torch.FloatTensor` of shape `(batch_size, num_domains)`):
        Logits for each domain (e.g. NYU and KITTI) in case multiple metric heads are used.

    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlosspredicted_depthdomain_logits.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r   __static_attributes__r       f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/zoedepth/modeling_zoedepth.pyr   r   $   s    2 )-D(5$$
%,37OXe//0715M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>r%   r   c                   r   ^  \ rS rSrSrU 4S jrS\\R                     S\\R                     4S jr	Sr
U =r$ )ZoeDepthReassembleStageF   a  
This class reassembles the hidden states of the backbone into image-like feature representations at various
resolutions.

This happens in 3 stages:
1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
   `config.readout_type`.
2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
3. Resizing the spatial dimensions (height, width).

Args:
    config (`[ZoeDepthConfig]`):
        Model configuration class defining the model architecture.
c           	      V  > [         TU ]  5         UR                  U l        [        R                  " 5       U l        [        UR                  UR                  5       H(  u  p#U R
                  R                  [        XUS95        M*     UR                  S:X  a  [        R                  " 5       U l        UR                  nUR                   H\  nU R                  R                  [        R                  " [        R                  " SU-  U5      [        UR                      5      5        M^     g g )N)channelsfactorproject   )super__init__readout_typer   
ModuleListlayerszipneck_hidden_sizesreassemble_factorsappendZoeDepthReassembleLayerreadout_projectsbackbone_hidden_size
SequentialLinearr
   
hidden_act)selfconfigneck_hidden_sizer,   hidden_size_	__class__s         r&   r0    ZoeDepthReassembleStage.__init__V   s    "//mmo(+F,D,DfF_F_(`$KK6vaghi )a )+$&MMOD! 55K--%%,,MM"))AO["I6RXRcRcKde . ,r%   r   returnc                 :   US   R                   S   n[        R                  " USS9nUSS2S4   USS2SS24   pUR                   u  pgnUR                  XbX85      nUR	                  SSSS5      R                  5       nU R                  S:X  aW  UR                  S5      R	                  S5      nUR                  SS9R                  U5      n	[        R                  " X4S	5      nO#U R                  S
:X  a  XR                  S	5      -   n/ n
[        UR                  USS95       Ht  u  pU R                  S:X  a  U R                  U   " U5      nUR	                  SSS5      R                  US	X#5      nU R                  U   " U5      nU
R                  U5        Mv     U
$ )z
Args:
    hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
        List of hidden states from the backbone.
r   dimNr   r	   r.   r-   )r   r.   r   add)shaper!   catreshapepermute
contiguousr1   flatten	unsqueeze	expand_as	enumeratesplitr9   r3   r7   )r>   r   patch_heightpatch_width
batch_size	cls_tokentotal_batch_sizesequence_lengthnum_channelsreadoutout	stage_idxhidden_states                r&   forwardZoeDepthReassembleStage.forwardg   s    #1%++A.
 		-Q7#0A#6ae8L=:G:M:M7<%--.>kh%--aAq9DDF	))11!4<<YGM))a)0::=IG "II}&>CM%'),?,?,CCM'01D1DZUV1D1W'X#I  I-#44Y?M (//1a8@@RQ]kL;;y1,?LJJ|$ (Y 
r%   )r3   r9   r1   r   r   r   r   r    r0   r   r!   Tensorr`   r$   __classcell__rC   s   @r&   r(   r(   F   s;    "&T%,,%7 &W[\a\h\hWi & &r%   r(   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r8      c           	      R  > [         TU ]  5         UR                  n[        R                  " XBSS9U l        US:  a  [        R                  " X"X3SS9U l        g US:X  a  [        R                  " 5       U l        g US:  a)  [        R                  " X"S[        SU-  5      SS9U l        g g )Nr   )in_channelsout_channelskernel_sizer   rk   stridepaddingr	   )
r/   r0   r:   r   Conv2d
projectionConvTranspose2dresizeIdentityint)r>   r?   r+   r,   rA   rC   s        r&   r0    ZoeDepthReassembleLayer.__init__   s    11))`ab A:,,XVlmnDKq[++-DKaZ))HAcRSV\R\oghiDK r%   c                 J    U R                  U5      nU R                  U5      nU$ Nrp   rr   r>   r_   s     r&   r`   ZoeDepthReassembleLayer.forward   s$    |4{{<0r%   rx   r   r   r   r   r0   r`   r$   rd   re   s   @r&   r8   r8      s    j  r%   r8   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )ZoeDepthFeatureFusionStage   c                    > [         TU ]  5         [        R                  " 5       U l        [        [        UR                  5      5       H'  nU R                  R                  [        U5      5        M)     g rw   )
r/   r0   r   r2   r3   rangelenr5   r7   ZoeDepthFeatureFusionLayer)r>   r?   rB   rC   s      r&   r0   #ZoeDepthFeatureFusionStage.__init__   sM    mmos63345AKK9&AB 6r%   c                     US S S2   n/ nS n[        XR                  5       H*  u  pEUc	  U" U5      nOU" X45      nUR                  U5        M,     U$ )NrI   )r4   r3   r7   )r>   r   fused_hidden_statesfused_hidden_stater_   layers         r&   r`   "ZoeDepthFeatureFusionStage.forward   sg    %dd+ !#&}kk#BL!)%*<%8"%*+=%L"&&'9: $C #"r%   )r3   r{   re   s   @r&   r}   r}      s    C# #r%   r}   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )ZoeDepthPreActResidualLayer   z
ResidualConvUnit, pre-activate residual unit.

Args:
    config (`[ZoeDepthConfig]`):
        Model configuration class defining the model architecture.
c           	        > [         TU ]  5         UR                  U l        UR                  b  UR                  OU R                  (       + n[
        R                  " 5       U l        [
        R                  " UR                  UR                  SSSUS9U l
        [
        R                  " 5       U l        [
        R                  " UR                  UR                  SSSUS9U l        U R                  (       a]  [
        R                  " UR                  UR                  S9U l        [
        R                  " UR                  UR                  S9U l        g g )Nr	   r   )rk   rm   rn   bias)eps)r/   r0   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1ro   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm_epsbatch_norm1batch_norm2)r>   r?   r   rC   s      r&   r0   $ZoeDepthPreActResidualLayer.__init__   s   $FF 11= ..((( 	$ 779II%%%%,
 779II%%%%,
 !~~f.G.GVMbMbcD!~~f.G.GVMbMbcD r%   r_   rE   c                    UnU R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nU R	                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX-   $ rw   )r   r   r   r   r   r   r   r>   r_   residuals      r&   r`   #ZoeDepthPreActResidualLayer.forward   s    ''5((6++L9L''5((6++L9L&&r%   )r   r   r   r   r   r   r   )r   r   r   r   r    r0   r!   rc   r`   r$   rd   re   s   @r&   r   r      s/     dD'ELL 'U\\ ' 'r%   r   c                   :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )r      a   Feature fusion layer, merges feature maps from different stages.

Args:
    config (`[ZoeDepthConfig]`):
        Model configuration class defining the model architecture.
    align_corners (`bool`, *optional*, defaults to `True`):
        The align_corner setting for bilinear upsample.
c                    > [         TU ]  5         X l        [        R                  " UR
                  UR
                  SSS9U l        [        U5      U l        [        U5      U l	        g )Nr   T)rk   r   )
r/   r0   align_cornersr   ro   r   rp   r   residual_layer1residual_layer2)r>   r?   r   rC   s      r&   r0   #ZoeDepthFeatureFusionLayer.__init__
  sR    *))F$=$=v?X?Xfgnrs:6B:6Br%   c                 t   Ubh  UR                   UR                   :w  a;  [        R                  R                  X!R                   S   UR                   S   4SSS9nXR	                  U5      -   nU R                  U5      n[        R                  R                  USSU R                  S9nU R                  U5      nU$ )Nr.   r	   bilinearFsizemoder   scale_factorr   r   )rK   r   
functionalinterpolater   r   r   rp   r   s      r&   r`   "ZoeDepthFeatureFusionLayer.forward  s    !!X^^3==44$6$6q$9<;M;Ma;P#QXbrw 5  (*>*>x*HHL++L9}}00qzI[I[ 1 
 |4r%   )r   rp   r   r   )Trw   	r   r   r   r   r    r0   r`   r$   rd   re   s   @r&   r   r      s    C r%   r   c                   r   ^  \ rS rSrSrU 4S jrS\\R                     S\\R                     4S jr	Sr
U =r$ )ZoeDepthNecki%  a3  
ZoeDepthNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
input and produces another list of tensors as output. For ZoeDepth, it includes 2 stages:

* ZoeDepthReassembleStage
* ZoeDepthFeatureFusionStage.

Args:
    config (dict): config dict.
c                   > [         TU ]  5         Xl        UR                  b"  UR                  R                  S;   a  S U l        O[        U5      U l        [        R                  " 5       U l	        UR                   H=  nU R                  R                  [        R                  " X!R                  SSSS95        M?     [        U5      U l        g )N)swinv2r	   r   F)rk   rn   r   )r/   r0   r?   backbone_config
model_typereassemble_stager(   r   r2   convsr5   r7   ro   r   r}   fusion_stage)r>   r?   channelrC   s      r&   r0   ZoeDepthNeck.__init__2  s     !!-&2H2H2S2SWa2a$(D!$;F$CD!]]_
//GJJbii1J1JXYcdkpqr 0 7v>r%   r   rE   c                    [        U[        [        45      (       d  [        S5      e[	        U5      [	        U R
                  R                  5      :w  a  [        S5      eU R                  b  U R                  XU5      n[        U5       VVs/ s H  u  pEU R                  U   " U5      PM     nnnU R                  U5      nXvS   4$ s  snnf )z
Args:
    hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
        List of hidden states from the backbone.
z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.rI   )
isinstancetuplelist	TypeErrorr   r?   r5   
ValueErrorr   rS   r   r   )r>   r   rU   rV   ifeaturefeaturesoutputs           r&   r`   ZoeDepthNeck.forwardC  s     -%77PQQ}T[[%B%B!CCnoo   , 11-{[M=F}=UV=UzqDJJqM'*=UV ""8,|## Ws   !C)r?   r   r   r   rb   re   s   @r&   r   r   %  s;    	?"$T%,,%7 $W[\a\h\hWi $ $r%   r   c                   l   ^  \ rS rSrSrU 4S jrS\\R                     S\R                  4S jr	Sr
U =r$ )#ZoeDepthRelativeDepthEstimationHeadi[  a  
Relative depth estimation head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
the predictions to the input resolution after the first convolutional layer (details can be found in DPT's paper's
supplementary material).
c                   > [         TU ]  5         UR                  U l        S U l        UR                  (       a  [
        R                  " SSSSSS9U l        UR                  n[
        R                  " X"S-  SSSS9U l        [
        R                  " SSS	S
9U l
        [
        R                  " US-  UR                  SSSS9U l        [
        R                  " UR                  SSSSS9U l        g )Nr   )r	   r	   )r   r   rl   r.   r	   r   r   Tr   r   )r/   r0   head_in_indexrp   add_projectionr   ro   r   conv1Upsampleupsamplenum_relative_featuresconv2conv3)r>   r?   r   rC   s      r&   r0   ,ZoeDepthRelativeDepthEstimationHead.__init__b  s    #11   iiSfV]cdDO,,YYxQAaYZ[
SWXYYx1}f.J.JXYbcmno
YYv;;QAVWabc
r%   r   rE   c                    XR                      nU R                  b,  U R                  U5      n[        R                  " 5       " U5      nU R	                  U5      nU R                  U5      nU R                  U5      n[        R                  " 5       " U5      nUnU R                  U5      n[        R                  " 5       " U5      nUR                  SS9nX24$ )Nr   rG   )	r   rp   r   r   r   r   r   r   squeeze)r>   r   r   r   s       r&   r`   +ZoeDepthRelativeDepthEstimationHead.forwardq  s    %&8&89??& OOM:MGGIm4M

=1m4

=1	-0 

=1	-0'//A/6((r%   )r   r   r   r   rp   r   rb   re   s   @r&   r   r   [  s3    d)T%,,%7 )ELL ) )r%   r   c                     X-   n X-   nU [         R                  " U 5      -  U[         R                  " U5      -  -
  X-
  [         R                  " X-
  U-   5      -  -
  $ )z%log(nCk) using stirling approximation)r!   log)nkr   s      r&   	log_binomr     sP    	A	Auyy|a%))A,..!%599QUS[;Q1QQQr%   c                   N   ^  \ rS rSrS\R
                  4U 4S jjrSS jrSrU =r	$ )LogBinomialSoftmaxi  r   c           	      4  > [         TU ]  5         Xl        X l        U R	                  S[
        R                  " SU5      R                  SSSS5      SS9  U R	                  S[
        R                  " U R                  S-
  /5      R                  SSSS5      SS9  g)	a  Compute log binomial distribution for n_classes

Args:
    n_classes (`int`, *optional*, defaults to 256):
        Number of output classes.
    act (`torch.nn.Module`, *optional*, defaults to `torch.softmax`):
        Activation function to apply to the output.
k_idxr   r   rI   F)
persistent	k_minus_1N)	r/   r0   r   actregister_bufferr!   arangeviewtensor)r>   	n_classesr   rC   s      r&   r0   LogBinomialSoftmax.__init__  s     	Well1i&@&E&EaQPQ&R_de[%,,
|*D*I*I!RQRTU*Vchir%   c                    UR                   S:X  a  UR                  S5      n[        R                  " SU-
  US5      n[        R                  " XS5      n[	        U R
                  U R                  5      U R                  [        R                  " U5      -  -   U R
                  U R                  -
  [        R                  " U5      -  -   nU R                  XR-  SS9$ )aX  Compute the log binomial distribution for probabilities.

Args:
    probabilities (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
        Tensor containing probabilities of each class.
    temperature (`float` or `torch.Tensor` of shape `(batch_size, num_channels, height, width)`, *optional*, defaults to 1):
        Temperature of distribution.
    eps (`float`, *optional*, defaults to 1e-4):
        Small number for numerical stability.

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, height, width)`:
        Log binomial distribution logbinomial(p;t).
r	   r   rG   )	ndimrQ   r!   clampr   r   r   r   r   )r>   probabilitiestemperaturer   one_minus_probabilitiesys         r&   r`   LogBinomialSoftmax.forward  s     ")33A6M"'++a-.?a"HM:dnndjj1jj599]334~~

*eii8O.PPQ 	

 xxQx//r%   )r   r   )      ?-C6?)
r   r   r   r   r!   softmaxr0   r`   r$   rd   re   s   @r&   r   r     s    !$%-- j0 0r%   r   c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ )%ZoeDepthConditionalLogBinomialSoftmaxi  c                   > [         TU ]  5         X#-   U-  n[        R                  " [        R                  " X#-   USSSS9[        R
                  " 5       [        R                  " USSSSS9[        R                  " 5       5      U l        SU l        UR                  U l	        UR                  U l
        [        U[        R                  S9U l        g)a  Per-pixel MLP followed by a Conditional Log Binomial softmax.

Args:
    in_features (`int`):
        Number of input channels in the main feature.
    condition_dim (`int`):
        Number of input channels in the condition feature.
    n_classes (`int`, *optional*, defaults to 256):
        Number of classes.
    bottleneck_factor (`int`, *optional*, defaults to 2):
        Hidden dim factor.

r   r   rl      r   )r   N)r/   r0   r   r;   ro   GELUSoftplusmlpp_epsmax_tempmin_tempr   r!   r   log_binomial_transform)r>   r?   in_featurescondition_dimr   bottleneck_factor
bottleneckrC   s          r&   r0   .ZoeDepthConditionalLogBinomialSoftmax.__init__  s    * 	!16GG
==IIk1:1UV`abGGIIIj%Qq!LKKM
 
&8&V#r%   c                    U R                  [        R                  " X4SS95      nUSS2SS2S4   USS2SS2S4   pTX@R                  -   nUSS2SS4   USS2SS4   USS2SS4   -   -  nXPR                  -   nUSS2SS4   USS2SS4   USS2SS4   -   -  nUR	                  S5      nU R
                  U R                  -
  U-  U R                  -   nU R                  XE5      $ )a2  
Args:
    main_feature (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
        Main feature.
    condition_feature (torch.Tensor of shape `(batch_size, num_channels, height, width)`):
        Condition feature.

Returns:
    `torch.Tensor`:
        Output log binomial distribution
r   rG   Nr.   .r   )r   r!   concatr   rQ   r   r   r   )r>   main_featurecondition_featureprobabilities_and_temperaturer   r   s         r&   r`   -ZoeDepthConditionalLogBinomialSoftmax.forward  s    )-|>_ef1g(h%)!RaR*5)!QR*5 #
 &

2%aCi0M!Q)4L}]^`acf]fOg4gh!JJ.!!Q),Aq#I0FUVXY[^U^I_0_`!++A.}}t}}4CdmmS**=FFr%   )r   r   r   r   r   )r   r.   r{   re   s   @r&   r   r     s     #WJG Gr%   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )ZoeDepthSeedBinRegressori  c                   > [         TU ]  5         UR                  U l        UR                  U l        X@l        XPl        [        R                  " U R                  USSS5      U l	        [        R                  " SS9U l        [        R                  " X2SSS5      U l        U R                  S:X  a  [        R                  " SS9U l        g[        R                  " 5       U l        g)a  Bin center regressor network.

Can be "normed" or "unnormed". If "normed", bin centers are bounded on the (min_depth, max_depth) interval.

Args:
    config (`int`):
        Model configuration.
    n_bins (`int`, *optional*, defaults to 16):
        Number of bin centers.
    mlp_dim (`int`, *optional*, defaults to 256):
        Hidden dimension.
    min_depth (`float`, *optional*, defaults to 1e-3):
        Min depth value.
    max_depth (`float`, *optional*, defaults to 10):
        Max depth value.
r   r   TinplacenormedN)r/   r0   bottleneck_featuresr   bin_centers_type	min_depth	max_depthr   ro   r   r   act1r   r   act2)r>   r?   n_binsmlp_dimr  r  rC   s         r&   r0   !ZoeDepthSeedBinRegressor.__init__  s    " 	!55 & 7 7""YYt//!QB
GGD)	YYw1a8
-1-B-Bh-NBGGD)	TVT_T_Ta	r%   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  S:X  a  US-   nX"R                  SSS9-  nU R                  U R                  -
  U-  n[        R                  R                  USSU R                  S9n[        R                  " USS	9nS
USS2SS2S4   USS2SS2S4   -   -  nX24$ X"4$ )zM
Returns tensor of bin_width vectors (centers). One vector b for every pixel
r  MbP?r   TrH   keepdim)r   r   r   r   r   r   constant)r   valuerG   g      ?NrI   .)r   r  r   r  r  sumr  r  r   r   padr!   cumsum)r>   xbin_centersbin_widths_normed
bin_widths	bin_edgess         r&   r`    ZoeDepthSeedBinRegressor.forward  s     JJqMIIaLJJqMiil  H,%,K +oo!To.R R..4>>9=NNJ**:7IPZbfbpbp*qJZQ7I1crc3;!7)Aqr3J:O!OPK$11 ++r%   )r  r  r  r   r   r   r  r  )   r   r  
   r{   re   s   @r&   r  r    s    b:, ,r%   r  alphagammac                 L    U R                  SXR                  U5      -  -   5      $ )a
  Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center
This is the default one according to the accompanying paper.

Args:
    dx (`torch.Tensor`):
        The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
    alpha (`float`, *optional*, defaults to 300):
        Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction.
    gamma (`int`, *optional*, defaults to 2):
        Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected.
        Lower gamma = farther reach.

Returns:
    torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc
r   )divpow)dxr+  r,  s      r&   inv_attractorr1  5  s#    " 66!effUm++,,r%   c                   >   ^  \ rS rSr    SU 4S jjrSS jrSrU =r$ )ZoeDepthAttractorLayeriI  c                   > [         T	U ]  5         UR                  U l        UR                  U l        UR                  U l        X0l        X l	        X@l
        XPl        X`l        UR                  =px[        R                  " XxSSS5      U l        [        R"                  " SS9U l        [        R                  " XS-  SSS5      U l        [        R"                  " SS9U l        g)za
Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth)
r   r   Tr  r.   N)r/   r0   attractor_alphar+  attractor_gammagemmaattractor_kindkindn_attractorsr  r  r  memory_efficientbin_embedding_dimr   ro   r   r   r  r   r  
r>   r?   r  r:  r  r  r;  r   r  rC   s
            r&   r0   ZoeDepthAttractorLayer.__init__J  s     	++
++
))	("" 0 !' 8 88YY{Q1=
GGD)	YYwq(8!QB
GGD)	r%   c                    Ub7  U(       a,  [         R                  R                  X1R                  SS SSS9nX-   nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUS-   nUR                  u  pgpUR                  X`R                  SX5      nUSS2SS2SS	4   n
[         R                  R                  X(U	4SSS9nU R                  (       d`  [        R                  [        R                  S
.U R                     nU" [        U
R!                  S5      UR!                  S5      -
  5      SS9nO[        R"                  " XR$                  S9n['        U R                  5       H+  nU[        U
SS2US	4   R!                  S5      U-
  5      -  nM-     U R                  S:X  a  XR                  -  nX-   nU R(                  U R*                  -
  U-  U R*                  -   n[        R,                  " USS9u  p[        R.                  " XR*                  U R(                  5      nX4$ )a  
The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers
and the attractor points (the latter are predicted by the MLP).

Args:
    x (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
        Feature block.
    prev_bin (`torch.Tensor` of shape `(batch_size, prev_number_of_bins, height, width)`):
        Previous bin centers normed.
    prev_bin_embedding (`torch.Tensor`, *optional*):
        Optional previous bin embeddings.
    interpolate (`bool`, *optional*, defaults to `True`):
        Whether to interpolate the previous bin embeddings to the size of the input features.

Returns:
    `Tuple[`torch.Tensor`, `torch.Tensor`]:
        New bin centers normed and scaled.
Nr   Tr   r   r  r.   r   .meanr   r   rG   devicerC  )r   r   r   rK   r   r  r   r  r   r:  r;  r!   rC  r   r9  r1  rQ   
zeros_likerE  r   r  r  sortclip)r>   r#  prev_binprev_bin_embeddingr   
attractorsrW   rB   heightwidthattractors_normedr$  funcdelta_cr   bin_new_centerss                   r&   r`   ZoeDepthAttractorLayer.forwardi  s   & )%']]%>%>&:UY &? &" &AJJqMIIaLJJqMYYq\
$&
'1'7'7$
v__Z1B1BAvU
 'q!Q|4mm//5/PZjn/o $$!JJuyy9$))DD=):)D)DQ)G+J_J_`aJb)bcijkG&&{;M;MNG4,,-=):1a9)E)O)OPQ)RU`)`aa . yyF"!$5$55%/~~6/IDNNZKQ7jjnndnnM++r%   )r  r  r+  r   r   r7  r9  r  r;  r  r:  r  )r)  r  r*  FNTr{   re   s   @r&   r3  r3  I  s     
 *><, <,r%   r3  c                   >   ^  \ rS rSr    SU 4S jjrSS jrSrU =r$ )ZoeDepthAttractorLayerUnnormedi  c                   > [         T	U ]  5         X0l        X l        X@l        XPl        UR                  U l        UR                  U l        UR                  U l
        X`l        UR                  =px[        R                  " XxSSS5      U l        [        R                   " SS9U l        [        R                  " XSSS5      U l        [        R&                  " 5       U l        g)z<
Attractor layer for bin centers. Bin centers are unbounded
r   r   Tr  N)r/   r0   r:  r  r  r  r5  r+  r,  r8  r9  r;  r<  r   ro   r   r   r  r   r   r  r=  s
            r&   r0   'ZoeDepthAttractorLayerUnnormed.__init__  s     	(""++
++
))	 0 & 8 88YY{Q1=
GGD)	YYwaA>
KKM	r%   c                 V   Ub7  U(       a,  [         R                  R                  X1R                  SS SSS9nX-   nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS u  pg[         R                  R                  X&U4SSS9nU R                  (       d`  [        R                  [        R                  S.U R                     n	U	" [        UR                  S5      UR                  S5      -
  5      SS	9n
O[        R                  " XR                   S
9n
[#        U R$                  5       H+  nU
[        USS2US4   R                  S5      U-
  5      -  n
M-     U R                  S:X  a  XR$                  -  n
X-   nUnX4$ )a,  
The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers
and the attractor points (the latter are predicted by the MLP).

Args:
    x (`torch.Tensor` of shape (batch_size, num_channels, height, width)`):
        Feature block.
    prev_bin (`torch.Tensor` of shape (batch_size, prev_num_bins, height, width)`):
        Previous bin centers normed.
    prev_bin_embedding (`torch.Tensor`, *optional*):
        Optional previous bin embeddings.
    interpolate (`bool`, *optional*, defaults to `True`):
        Whether to interpolate the previous bin embeddings to the size of the input features.

Returns:
    `Tuple[`torch.Tensor`, `torch.Tensor`]:
        New bin centers unbounded. Two outputs just to keep the API consistent with the normed version.
Nr@  r   TrA  rB  r.   r   rG   rD  .rC  )r   r   r   rK   r   r  r   r  r;  r!   rC  r   r9  r1  rQ   rF  rE  r   r:  )r>   r#  rI  rJ  r   rK  rL  rM  r$  rO  rP  r   rQ  s                r&   r`   &ZoeDepthAttractorLayerUnnormed.forward  s   & )%']]%>%>&:UY &? &" &AJJqMIIaLJJqMYYq\
"((-mm//5/PZjn/o$$!JJuyy9$))DD=)=)=a)@;CXCXYZC[)[\bcdG&&{;M;MNG4,,-=Aq#I)>)H)H)Kk)YZZ . yyF"!$5$55%/%++r%   )r  r  r+  r   r   r,  r9  r  r;  r  r:  r  )r)  r  r*  TrS  r{   re   s   @r&   rU  rU    s     
 ":3, 3,r%   rU  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )ZoeDepthProjectori  c                    > [         TU ]  5         [        R                  " XSSS5      U l        [        R
                  " SS9U l        [        R                  " X2SSS5      U l        g)zProjector MLP.

Args:
    in_features (`int`):
        Number of input channels.
    out_features (`int`):
        Number of output channels.
    mlp_dim (`int`, *optional*, defaults to 128):
        Hidden dimension.
r   r   Tr  N)r/   r0   r   ro   r   r   r   r   )r>   r   out_featuresr  rC   s       r&   r0   ZoeDepthProjector.__init__  sL     	YY{Q1=
774(YYwaA>
r%   r_   rE   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rw   )r   r   r   ry   s     r&   r`   ZoeDepthProjector.forward  s2    zz,/xx-zz,/r%   )r   r   r   )   )
r   r   r   r   r0   r!   rc   r`   r$   rd   re   s   @r&   r[  r[    s(    ?"ELL U\\  r%   r[  c                     ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jr  SS\R                  S\R                  S	\R                  S
\	\R                     S\	\   S\\R                     4S jjrSrU =r$ )ZoeDepthMultiheadAttentioni  zKEquivalent implementation of nn.MultiheadAttention with `batch_first=True`.c                   > [         TU ]  5         X-  S:w  a  [        SU SU S35      eX l        [	        X-  5      U l        U R                  U R
                  -  U l        [        R                  " XR                  5      U l	        [        R                  " XR                  5      U l
        [        R                  " XR                  5      U l        [        R                  " X5      U l        [        R                  " U5      U l        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())r/   r0   r   num_attention_headsrt   attention_head_sizeall_head_sizer   r<   querykeyr  out_projDropoutdropout)r>   rA   rf  rm  rC   s       r&   r0   #ZoeDepthMultiheadAttention.__init__  s    ,1#K= 1-.a1 
 $7 #&{'H#I !558P8PPYY{,>,>?
99[*<*<=YY{,>,>?
		+;zz'*r%   r#  rE   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )NrI   r   r.   r   r	   )r   rf  rg  r   rN   )r>   r#  new_x_shapes      r&   transpose_for_scores/ZoeDepthMultiheadAttention.transpose_for_scores/  sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r%   querieskeysvaluesattention_maskoutput_attentionsc                    U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      n[        R
                  " XgR                  SS5      5      n	U	[        R                  " U R                  5      -  n	Ub  X-   n	[        R                  R                  U	SS9n
U R                  U
5      n
[        R
                  " X5      nUR                  SSSS5      R                  5       nUR!                  5       S S U R"                  4-   nUR%                  U5      nU R'                  U5      nU(       a  X4nU$ U4nU$ )NrI   r@  rG   r   r.   r   r	   )rq  ri  rj  r  r!   matmul	transposemathsqrtrg  r   r   r   rm  rN   rO   r   rh  r   rk  )r>   rs  rt  ru  rv  rw  query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                 r&   r`   "ZoeDepthMultiheadAttention.forward4  sQ    //

70CD--dhhtn=	//

60BC !<<5H5HR5PQ+dii8P8P.QQ%/@ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCm46G=2 O\M]r%   )rh  rg  rm  rj  rf  rk  ri  r  )NF)r   r   r   r   r    r0   r!   rc   rq  r   r"   boolr   r`   r$   rd   re   s   @r&   rc  rc    s    U+(%ell %u|| % 7;,1%% ll% 	%
 !!2!23% $D>% 
u||	% %r%   rc  c                   Z   ^  \ rS rSrSU 4S jjr SS\\R                     4S jjrSr	U =r
$ )ZoeDepthTransformerEncoderLayeri\  c                   > [         TU ]  5         UR                  nUR                  nUR                  n[        XFUS9U l        [        R                  " XE5      U l	        [        R                  " U5      U l        [        R                  " XT5      U l        [        R                  " U5      U l        [        R                  " U5      U l        [        R                  " U5      U l        [        R                  " U5      U l        [$        U   U l        g )N)rm  )r/   r0   patch_transformer_hidden_size#patch_transformer_intermediate_size%patch_transformer_num_attention_headsrc  	self_attnr   r<   linear1rl  rm  linear2	LayerNormnorm1norm2dropout1dropout2r
   
activation)r>   r?   rm  r  rA   intermediate_sizerf  rC   s          r&   r0   (ZoeDepthTransformerEncoderLayer.__init__]  s    ::"FF$JJ3K^efyy@zz'*yy!2@\\+.
\\+.


7+

7+ ,r%   src_maskc           	      >   U=p4U R                  X4XS9S   nXR                  U5      -   nU R                  U5      nU R                  U R	                  U R                  U R                  U5      5      5      5      nXR                  U5      -   nU R                  U5      nU$ )N)rs  rt  ru  rv  r   )	r  r  r  r  rm  r  r  r  r  )r>   srcr  rs  rt  src2s         r&   r`   'ZoeDepthTransformerEncoderLayer.forwardq  s    
 ~~g~^_`aMM$''jjo||DLLc9J)KLMMM$''jjo
r%   )	r  rm  r  r  r  r  r  r  r  )g?relurw   )r   r   r   r   r0   r   r!   rc   r`   r$   rd   re   s   @r&   r  r  \  s*    -. ,0 5<<( r%   r  c                   P   ^  \ rS rSrU 4S jrS\R                  4S jrS rSr	U =r
$ )ZoeDepthPatchTransformerEncoderi  c                    > [         TU ]  5         UR                  n[        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        R                  " X!R                  SSSS9U l        gs  snf )zViT-like transformer block

Args:
    config (`ZoeDepthConfig`):
        Model configuration class defining the model architecture.
r   r   rl   N)r/   r0   r  r   r2   r   num_patch_transformer_layersr  transformer_encoderro   r  embedding_convPxP)r>   r?   ri   rB   rC   s       r&   r0   (ZoeDepthPatchTransformerEncoder.__init__  s{     	00#%==>CFDgDg>hi>h,V4>hi$
  "$==1UV`a"
 js   Bcpuc           
         [         R                  " SX%US9R                  S5      n[         R                  " SUSXTS9R                  S5      n[         R                  " U[         R                  " [         R
                  " SUS95      * U-  -  5      nXh-  n	[         R                  " [         R                  " U	5      [         R                  " U	5      /SS9n	U	R                  SS9R                  USS5      n	U	$ )zGenerate positional encodings

Args:
    sequence_length (int): Sequence length
    embedding_dim (int): Embedding dimension

Returns:
    torch.Tensor: Positional encodings.
r   )dtyperE  r   r.   g     @rD  rG   )
r!   r   rQ   expr   r   rL   sincosrepeat)
r>   rW   rZ   embedding_dimrE  r  positionindexdiv_termpos_encodings
             r&   positional_encoding_1d6ZoeDepthPatchTransformerEncoder.positional_encoding_1d  s     <<?OYYZ[\QqMWWXYZ99Uuyygf1U'V&VYf&fgh*yy%))L"9599\;R!SYZ[#--!-4;;J1Mr%   c           	      f   U R                  U5      R                  S5      n[        R                  R	                  US5      nUR                  SSS5      nUR                  u  p4nX R                  X4XRR                  UR                  S9-   n[        S5       H  nU R                  U   " U5      nM     U$ )zForward pass

Args:
    x (torch.Tensor - NCHW): Input feature tensor

Returns:
    torch.Tensor - Transformer output embeddings of shape (batch_size, sequence_length, embedding_dim)
r.   )r   r   r   r   )rE  r  r   )r  rP   r   r   r!  rN   rK   r  rE  r  r   r  )r>   r#  
embeddingsrW   rZ   r  r   s          r&   r`   'ZoeDepthPatchTransformerEncoder.forward  s     ++A.66q9
]]&&z6:
''1a0
5?5E5E2
]"="=?P?PXbXhXh #> #
 

 qA11!4Z@J  r%   )r  r  )r   r   r   r   r0   r!   float32r  r`   r$   rd   re   s   @r&   r  r    s'    
& Y^ejerer $ r%   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )ZoeDepthMLPClassifieri  c                    > [         TU ]  5         Un[        R                  " X5      U l        [        R
                  " 5       U l        [        R                  " X25      U l        g rw   )r/   r0   r   r<   r  r   r  r  )r>   r   r]  hidden_featuresrC   s       r&   r0   ZoeDepthMLPClassifier.__init__  s@    %yy>'')yy?r%   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rw   )r  r  r  )r>   r_   r   s      r&   r`   ZoeDepthMLPClassifier.forward  s2    ||L1|4\2r%   )r  r  r  )rE   Nr{   re   s   @r&   r  r    s    @ r%   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )*ZoeDepthMultipleMetricDepthEstimationHeadsi  zf
Multiple metric depth estimation heads. A MLP classifier is used to route between 2 different heads.
c                   > [         TU ]  5         UR                  nUR                  nUR                  U l        UR
                  U l        UR                  n[        R                  " XDSSSS9U l	        [        U5      U l        [        SSS9U l        U R
                  S:X  a  [        nOU R
                  S:X  a  [        n[        R                   " UR                   Vs0 s H   nUS	   [#        UUS
   US-  US   US   S9_M"     sn5      U l        ['        XBUS-  S9U l        [        R*                  " [-        S5       Vs/ s H  n['        UR.                  UUS-  S9PM     sn5      U l        [        R                   " UR                   VV	s0 s HO  nUS	   [        R*                  " [-        [3        U5      5       V	s/ s H  n	W" UX9   US   US   S9PM     sn	5      _MQ     sn	n5      U l        UR6                  n
[        R                   " UR                   Vs0 s H  nUS	   [9        UU
UUS
   SS9_M     sn5      U l        g s  snf s  snf s  sn	f s  sn	nf s  snf )Nr   r   rl   ra  r.   r   r]  r  softplusnamer  r  r  )r  r  r  r  )r   r]  r  r   r  r  r  )r  )r/   r0   r<  num_attractorsbin_configurationsr  r  r   ro   r   r  patch_transformerr  mlp_classifierr3  rU  
ModuleDictr  seed_bin_regressorsr[  seed_projectorr2   r   r   
projectorsr   rK  r   r   conditional_log_binomial)r>   r?   r<  r:  r  	AttractorconfrB   configurationr   last_inrC   s              r&   r0   3ZoeDepthMultipleMetricDepthEstimationHeads.__init__  s   "44,,"(";"; & 7 7 %88YY2UV_`jkl
 "A!H3RST   H,.I""j06I $&== #55	 6D V6>-2";/";/  6	$
  0+UfjkUk
 -- q "A " & 9 9!2-2
 "	
 -- &,%>%> &?M f%r}} "'s<'8!9 ":A ""#/?&3K&@&3K&@	 ":
( 
 &?
" ..(* &,%>%>	 &?M f%'L%!(+&'(  &?	)
%]	 &	s*   'I>"I0I
;II
II
c                 b   U R                  U5      nU R                  U5      S S 2SS S 24   nU R                  U5      n[        R                  " UR                  SSS9SS9nU R                   V	s/ s H  oS   PM	     n
n	U
[        R                  " USS9R                  5       R                  5          n U R                   Vs/ s H  oS   U:X  d  M  UPM     snS   nUS	   nUS
   nU R                  U   nU" U5      u  nnU R                  S;   a  UU-
  X-
  -  nOUnU R                  U5      nU R                  U   n[!        U R"                  UU5       H  u  nnnU" U5      nU" UUUSS9u  nnUnUnM!     Un[$        R&                  R)                  WUR*                  SS  SSS9n[$        R&                  R)                  WUR*                  SS  SSS9nU R,                  U   nU" UU5      n[        R
                  " UU-  SSS9nUU4$ s  sn	f s  snf ! [         a    [        SU S35      ef = f)Nr   Tr  rI   rG   r  zbin_configurations_name z! not found in bin_configurationssr  r  r  hybrid2r   r@  r   rA  r   )r   r  r  r!   r   r   r  argmaxr   item
IndexErrorr   r  r  r  rK  r4   r  r   r   r   rK   r  )r>   outconv_activationr  feature_blocksrelative_depthr#  	embeddingr   domain_voter  namesbin_configurations_namer?   r  r  r  seed_bin_regressorrB   seed_bin_centersrI  rJ  rK  	projector	attractorr   bin_embeddingbinr$  lastr  r]   s                                  r&   r`   2ZoeDepthMultipleMetricDepthEstimationHeads.forward)  su   JJz" **1-aAg6	 ++I6mmM$5$5!T$5$JPRS =A<S<ST<S=v&<ST"'[b(I(Q(Q(S(X(X(Z"[	t)-)@)@n)@v6NVmDmF)@nopqD %	%	!556MN03  $99(949NOH'H!003__%<=
-0*n-])Iy'%g.M(BTbfgCH!.	 .^ "mm//TZZ_S]mq/r11-BCWaqu1v#'#@#@AX#Y $T=9 iiKQ=M!!K U o 	t78O7PPqrss	ts*   /H
4H HHH H H.)
rK  r  r  r  r   r  r  r  r  r  r   re   s   @r&   r  r    s    R
h1" 1"r%   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )!ZoeDepthMetricDepthEstimationHeadi]  c                   > [         TU ]  5         UR                  S   nUS   nUS   nUS   nUR                  nUR                  nUR
                  nX@l        XPl        Xl        UR                  n	[        R                  " XSSSS9U l        U R
                  S:X  a  [        n
OU R
                  S:X  a  [        n
[        XXES	9U l        [!        XS
9U l        [        R$                  " ['        S5       Vs/ s H  n[!        UR(                  US
9PM     sn5      U l        [        R$                  " ['        S5       Vs/ s H  nW
" UUX|   UUS9PM     sn5      U l        UR.                  S-   n[1        UUUUS9U l        g s  snf s  snf )Nr   r  r  r  r   rl   r  r  r  r  r   )r  r:  r  r  )r   )r/   r0   r  r<  r  r  r  r  r  r   ro   r   r3  rU  r  r  r[  r  r2   r   r   r  rK  r   r   r  )r>   r?   bin_configurationr  r  r  r<  r:  r  r  r  rB   r   r  rC   s                 r&   r0   *ZoeDepthMetricDepthEstimationHead.__init__^  s   "55a8"8,%k2	%k2	"44,,!22"" 0 %88YY2UV_`jkl
   H,.I""j06I":Y#
 0<Op-- q!A "f.G.GVgh!
 -- q	 "A !!-'' "	
 ..2 )N	)
%+	s   3E=:Fc                 Z   U R                  U5      nU R                  U5      u  pgU R                  S;   a)  XpR                  -
  U R                  U R                  -
  -  nOUnU R                  U5      n	[        U R                  U R                  U5       H8  u  pnU
" U5      nU" XU	SS9u  pUR                  5       nUR                  5       n	M:     UnUR                  S5      n[        R                  R                  UUR                  SS  SSS9n[        R                   " UU/SS9n[        R                  R                  WUR                  S	S  SSS
9nU R#                  UU5      n[        R                  R                  WUR                  S	S  SSS
9n[        R$                  " X_-  SSS9nUS 4$ )Nr  Tr  r   r.   r   r   rG   r@  rA  r  )r   r  r  r  r  r  r4   r  rK  clonerQ   r   r   r   rK   r!   rL   r  r   )r>   r  r  r  r  r#  rB   r  rI  rJ  r  r  r   r  r  r$  r  relative_conditioningr]   s                      r&   r`   )ZoeDepthMetricDepthEstimationHead.forward  s   JJz""55a8  $99(>>9dnnt~~>]^H'H!003 .1$//Sa-b)I'%g.M(BTbfgCyy{H!.!4!4!6	 .c " !/ 8 8 ; " 9 9!

12ZW[ !: !
 yy$ 56A>11-BCWaqu1v))$> mm//QWWRS\PZjn/oiiQ=Dyr%   )	rK  r  r  r   r  r  r  r  r  r{   re   s   @r&   r  r  ]  s    9
v" "r%   r  c                   *    \ rS rSr\rSrSrSrS r	Sr
g)ZoeDepthPreTrainedModeli  zoedepthpixel_valuesTc                 (   [        U[        R                  [        R                  [        R                  45      (       ak  UR
                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        gg)zInitialize the weightsg        )rC  stdNr   )r   r   r<   ro   rq   weightdatanormal_r?   initializer_ranger   zero_r  fill_)r>   modules     r&   _init_weights%ZoeDepthPreTrainedModel._init_weights  s    fryy"))R5G5GHII MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S) .r%   r   N)r   r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr  r$   r   r%   r&   r  r    s    !L"$O&*#
*r%   r  zU
    ZoeDepth model with one or multiple metric depth estimation head(s) on top.
    )custom_introc                      ^  \ rS rSrU 4S jr\    SS\R                  S\\R                     S\\
   S\\
   S\\
   S\\\R                     \4   4S	 jj5       rS
rU =r$ )ZoeDepthForDepthEstimationi  c                 L  > [         TU ]  U5        [        U5      U l        [	        U R                  R
                  S5      (       ap  [	        U R                  R
                  S5      (       aK  U R                  R
                  R                  Ul        U R                  R
                  R                  U l        O[        S5      e[        U5      U l        [        U5      U l        [        UR                  5      S:  a  [!        U5      O
[#        U5      U l        U R'                  5         g )NrA   
patch_sizezXZoeDepth assumes the backbone's config to have `hidden_size` and `patch_size` attributesr   )r/   r0   r   backbonehasattrr?   rA   r:   r	  r   r   neckr   relative_headr   r  r  r  metric_head	post_init)r>   r?   rC   s     r&   r0   #ZoeDepthForDepthEstimation.__init__  s     %f-4==''77GDMMDXDXZf<g<g*.--*>*>*J*JF'"mm22==DOj  !(	@H 6,,-1 7v>26: 	 	r%   r  labelsrw  output_hidden_statesreturn_dictrE   c                    SnUb  [        S5      eUb  UOU R                  R                  nUb  UOU R                  R                  nUb  UOU R                  R                  nU R
                  R                  XUS9nUR                  nUR                  u    pnU R                  nX-  nX-  nU R                  XU5      u  pU/U-   nU R                  U5      u  nnU/U-   nU R                  US   US   USS US9u  nnUR                  SS9nU(       d"  Ub  UU4USS -   nO	U4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                   S	9$ )
a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth depth estimation maps for computing the loss.

Examples:
```python
>>> from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation
>>> import torch
>>> import numpy as np
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti")
>>> model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti")

>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # interpolate to original size
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     source_sizes=[(image.height, image.width)],
... )

>>> # visualize the prediction
>>> predicted_depth = post_processed_output[0]["predicted_depth"]
>>> depth = predicted_depth * 255 / predicted_depth.max()
>>> depth = depth.detach().cpu().numpy()
>>> depth = Image.fromarray(depth.astype("uint8"))
```NzTraining is not implemented yet)r  rw  r   r   r.   )r  r  r  r  rG   )r   r   r   r   r   )NotImplementedErrorr?   use_return_dictr  rw  r
  forward_with_filtered_kwargsfeature_mapsrK   r	  r  r  r  r   r   r   r   )r>   r  r  rw  r  r  r   r  r   rB   rL  rM  r	  rU   rV   r   r]   r  metric_depthr   r   s                        r&   r`   "ZoeDepthForDepthEstimation.forward  s   Z %&GHH%0%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq--<<Wh = 
  ,,*001e__
+)"&))M"Uj=(#'#5#5m#D j3&*&6&6"1v#a&QRao '7 '
#m $+++2(&6D&712;6)-)9TGf$EvE+('!//))
 	
r%   )r
  r  r  r	  r  )NNNN)r   r   r   r   r0   r   r!   r"   r   
LongTensorr  r   r   rc   r   r`   r$   rd   re   s   @r&   r  r    s    2  .2,0/3&*[
''[
 ))*[
 $D>	[

 'tn[
 d^[
 
uU\\"$88	9[
 [
r%   r  )gHz>)i,  r.   );r    r{  dataclassesr   typingr   r   r   r   r!   torch.utils.checkpointr   activationsr
   modeling_outputsr   modeling_utilsr   utilsr   r   r   utils.backbone_utilsr   configuration_zoedepthr   
get_loggerr   loggerr   Moduler(   r8   r}   r   r   r   r   r   r   r   r  jitscriptfloatrt   r1  r3  rU  r[  rc  r  r  r  r  r  r  r  __all__r   r%   r&   <module>r,     s5     ! / /    ! 4 - 9 9 1 2 
		H	% ?; ? ?BGbii GTbii 0# #2;'")) ;'~" "J3$299 3$l))")) ))XR)0 )0X@GBII @GF5,ryy 5,p -U - - -&\,RYY \,~Q,RYY Q,h		 6B BJ!bii !H<bii <~BII "J" J"Z^		 ^F *o * *& 
v
!8 v

v
r ()B
Cr%   