"""PyTorch ZoeDepth model."""

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import DepthEstimatorOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.backbone_utils import load_backbone
from .configuration_zoedepth import ZoeDepthConfig


logger = logging.get_logger(__name__)


@dataclass
class ZoeDepthDepthEstimatorOutput(ModelOutput):
    """
    Extension of `DepthEstimatorOutput` to include domain logits (ZoeDepth specific).

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Predicted depth for each pixel.

        domain_logits (`torch.FloatTensor` of shape `(batch_size, num_domains)`):
            Logits for each domain (e.g. NYU and KITTI) in case multiple metric heads are used.

        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlosspredicted_depthdomain_logits.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r    r   r   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/zoedepth/modeling_zoedepth.pyr   $   s   
 r   c                       >   e Zd ZdZ fddZdeej deej fddZ  Z	S )ZoeDepthReassembleStageaE  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        self.readout_type = config.readout_type
        self.layers = nn.ModuleList()
        for neck_hidden_size, factor in zip(config.neck_hidden_sizes, config.reassemble_factors):
            self.layers.append(ZoeDepthReassembleLayer(config, channels=neck_hidden_size, factor=factor))

        if config.readout_type == "project":
            self.readout_projects = nn.ModuleList()
            hidden_size = config.backbone_hidden_size
            for _ in config.neck_hidden_sizes:
                self.readout_projects.append(
                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                )

    def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        """
        batch_size = hidden_states[0].shape[0]

        # stack along the batch dimension so all stages are processed at once
        hidden_states = torch.cat(hidden_states, dim=0)

        cls_token, hidden_states = hidden_states[:, 0], hidden_states[:, 1:]
        # reshape hidden_states to (total_batch_size, num_channels, height, width)
        total_batch_size, sequence_length, num_channels = hidden_states.shape
        hidden_states = hidden_states.reshape(total_batch_size, patch_height, patch_width, num_channels)
        hidden_states = hidden_states.permute(0, 3, 1, 2).contiguous()

        if self.readout_type == "project":
            # flatten to (total_batch_size, sequence_length, num_channels)
            hidden_states = hidden_states.flatten(2).permute((0, 2, 1))
            readout = cls_token.unsqueeze(1).expand_as(hidden_states)
            # concatenate the readout ([CLS]) token to get (total_batch_size, sequence_length, 2 * num_channels)
            hidden_states = torch.cat((hidden_states, readout), -1)
        elif self.readout_type == "add":
            hidden_states = hidden_states.flatten(2) + cls_token.unsqueeze(-1)
            hidden_states = hidden_states.permute(0, 2, 1)

        out = []
        for stage_idx, hidden_state in enumerate(hidden_states.split(batch_size, dim=0)):
            if self.readout_type == "project":
                hidden_state = self.readout_projects[stage_idx](hidden_state)

            if self.readout_type in ("project", "add"):
                # fold the sequence back into (batch_size, num_channels, height, width)
                hidden_state = hidden_state.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width)

            hidden_state = self.layers[stage_idx](hidden_state)
            out.append(hidden_state)

        return out


class ZoeDepthReassembleLayer(nn.Module):
    def __init__(self, config, channels, factor):
        super().__init__()
        # projection
        hidden_size = config.backbone_hidden_size
        self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1)

        # up/down sampling depending on factor
        if factor > 1:
            self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
        elif factor == 1:
            self.resize = nn.Identity()
        elif factor < 1:
            # so should downsample
            self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1)

    def forward(self, hidden_state):
        hidden_state = self.projection(hidden_state)
        hidden_state = self.resize(hidden_state)
        return hidden_state


class ZoeDepthFeatureFusionStage(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(len(config.neck_hidden_sizes)):
            self.layers.append(ZoeDepthFeatureFusionLayer(config))

    def forward(self, hidden_states):
        # reverse the hidden states, fusion starts from the last (coarsest) one
        hidden_states = hidden_states[::-1]

        fused_hidden_states = []
        fused_hidden_state = None
        for hidden_state, layer in zip(hidden_states, self.layers):
            if fused_hidden_state is None:
                # first layer only uses the last hidden state
                fused_hidden_state = layer(hidden_state)
            else:
                fused_hidden_state = layer(fused_hidden_state, hidden_state)
            fused_hidden_states.append(fused_hidden_state)

        return fused_hidden_states


class ZoeDepthPreActResidualLayer(nn.Module):
    """
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        self.use_batch_norm = config.use_batch_norm_in_fusion_residual
        use_bias_in_fusion_residual = (
            config.use_bias_in_fusion_residual
            if config.use_bias_in_fusion_residual is not None
            else not self.use_batch_norm
        )

        self.activation1 = nn.ReLU()
        self.convolution1 = nn.Conv2d(
            config.fusion_hidden_size,
            config.fusion_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias_in_fusion_residual,
        )

        self.activation2 = nn.ReLU()
        self.convolution2 = nn.Conv2d(
            config.fusion_hidden_size,
            config.fusion_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias_in_fusion_residual,
        )

        if self.use_batch_norm:
            self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size, eps=config.batch_norm_eps)
            self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size, eps=config.batch_norm_eps)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        residual = hidden_state
        hidden_state = self.activation1(hidden_state)
        hidden_state = self.convolution1(hidden_state)

        if self.use_batch_norm:
            hidden_state = self.batch_norm1(hidden_state)

        hidden_state = self.activation2(hidden_state)
        hidden_state = self.convolution2(hidden_state)

        if self.use_batch_norm:
            hidden_state = self.batch_norm2(hidden_state)

        return hidden_state + residual
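

# Illustrative sketch (not part of the original module): "pre-activation" means each
# convolution is preceded by its non-linearity, with the identity skip added at the end.
def _demo_preact_order(hidden_state, convolution):
    residual = hidden_state
    hidden_state = nn.functional.relu(hidden_state)  # activation first ...
    hidden_state = convolution(hidden_state)  # ... then convolution
    return hidden_state + residual  # identity skip connection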


class ZoeDepthFeatureFusionLayer(nn.Module):
    """Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    """

    def __init__(self, config, align_corners=True):
        super().__init__()

        self.align_corners = align_corners
        self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)
        self.residual_layer1 = ZoeDepthPreActResidualLayer(config)
        self.residual_layer2 = ZoeDepthPreActResidualLayer(config)

    def forward(self, hidden_state, residual=None):
        if residual is not None:
            if hidden_state.shape != residual.shape:
                residual = nn.functional.interpolate(
                    residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
                )
            hidden_state = hidden_state + self.residual_layer1(residual)

        hidden_state = self.residual_layer2(hidden_state)
        hidden_state = nn.functional.interpolate(
            hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners
        )
        hidden_state = self.projection(hidden_state)

        return hidden_state


class ZoeDepthNeck(nn.Module):
    """
    ZoeDepthNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For ZoeDepth, it includes 2 stages:

    * ZoeDepthReassembleStage
    * ZoeDepthFeatureFusionStage.

    Args:
        config (dict): config dict.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # reassembling is only required for non-hierarchical backbones; hierarchical
        # backbones such as Swinv2 already produce image-like feature maps
        if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]:
            self.reassemble_stage = None
        else:
            self.reassemble_stage = ZoeDepthReassembleStage(config)

        self.convs = nn.ModuleList()
        for channel in config.neck_hidden_sizes:
            self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))

        # fusion
        self.fusion_stage = ZoeDepthFeatureFusionStage(config)

    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        """
        if not isinstance(hidden_states, (tuple, list)):
            raise TypeError("hidden_states should be a tuple or list of tensors")

        if len(hidden_states) != len(self.config.neck_hidden_sizes):
            raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")

        # postprocess hidden states
        if self.reassemble_stage is not None:
            hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)

        features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]

        # fusion blocks
        output = self.fusion_stage(features)

        return output, features[-1]
zZoeDepthNeck.forwardrN   r   r   r8   r    r}   %  s    &r}   c                       s:   e Zd ZdZ fddZdeej dejfddZ  Z	S )#ZoeDepthRelativeDepthEstimationHeada  
    Relative depth estimation head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in DPT's paper's
    supplementary material).
    """

    def __init__(self, config):
        super().__init__()

        self.head_in_index = config.head_in_index

        self.projection = None
        if config.add_projection:
            self.projection = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        features = config.fusion_hidden_size
        self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1)
        self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
        self.conv2 = nn.Conv2d(features // 2, config.num_relative_features, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(config.num_relative_features, 1, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
        hidden_states = hidden_states[self.head_in_index]

        if self.projection is not None:
            hidden_states = self.projection(hidden_states)
            hidden_states = nn.ReLU()(hidden_states)

        hidden_states = self.conv1(hidden_states)
        hidden_states = self.upsample(hidden_states)
        hidden_states = self.conv2(hidden_states)
        hidden_states = nn.ReLU()(hidden_states)
        # the metric heads also condition on these features (after the second conv + ReLU)
        features = hidden_states
        hidden_states = self.conv3(hidden_states)
        hidden_states = nn.ReLU()(hidden_states)

        predicted_depth = hidden_states.squeeze(dim=1)

        return predicted_depth, features


def log_binom(n, k, eps=1e-7):
    """log(nCk) using stirling approximation"""
    n = n + eps
    k = k + eps
    return n * torch.log(n) - k * torch.log(k) - (n - k) * torch.log(n - k + eps)


class LogBinomialSoftmax(nn.Module):
    def __init__(self, n_classes=256, act=torch.softmax):
        """Compute log binomial distribution for n_classes

        Args:
            n_classes (`int`, *optional*, defaults to 256):
                Number of output classes.
            act (`torch.nn.Module`, *optional*, defaults to `torch.softmax`):
                Activation function to apply to the output.
        """
        super().__init__()
        self.k = n_classes
        self.act = act
        self.register_buffer("k_idx", torch.arange(0, n_classes).view(1, -1, 1, 1), persistent=False)
        self.register_buffer("k_minus_1", torch.tensor([self.k - 1]).view(1, -1, 1, 1), persistent=False)

    def forward(self, probabilities, temperature=1.0, eps=1e-4):
        """Compute the log binomial distribution for probabilities.

        Args:
            probabilities (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Tensor containing probabilities of each class.
            temperature (`float` or `torch.Tensor` of shape `(batch_size, num_channels, height, width)`, *optional*, defaults to 1):
                Temperature of distribution.
            eps (`float`, *optional*, defaults to 1e-4):
                Small number for numerical stability.

        Returns:
            `torch.Tensor` of shape `(batch_size, num_channels, height, width)`:
                Log binomial distribution logbinomial(p;t).
        """
        if probabilities.ndim == 3:
            probabilities = probabilities.unsqueeze(1)  # make it (batch_size, num_channels, height, width)

        one_minus_probabilities = torch.clamp(1 - probabilities, eps, 1)
        probabilities = torch.clamp(probabilities, eps, 1)

        y = (
            log_binom(self.k_minus_1, self.k_idx)
            + self.k_idx * torch.log(probabilities)
            + (self.k_minus_1 - self.k_idx) * torch.log(one_minus_probabilities)
        )
        return self.act(y / temperature, dim=1)
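

# Illustrative sketch (not part of the original module): the log-binomial softmax turns a
# single probability p per pixel into a unimodal distribution over the n_classes bins,
# peaked around p * (n_classes - 1).
def _demo_log_binomial_softmax():
    log_binomial = LogBinomialSoftmax(n_classes=256)
    probabilities = torch.tensor([0.1, 0.5, 0.9]).view(3, 1, 1, 1)
    distribution = log_binomial(probabilities, temperature=1.0)  # shape (3, 256, 1, 1)
    modes = distribution.argmax(dim=1).flatten()
    # the mode of a Binomial(n_classes - 1, p) distribution is close to p * (n_classes - 1)
    assert torch.all(torch.abs(modes - probabilities.flatten() * 255) < 5)
    return distribution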
zLogBinomialSoftmax.forward)r   r   )r   r   r   r   softmaxr(   rM   rP   r   r   r8   r    r     s    r   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )%ZoeDepthConditionalLogBinomialSoftmaxr   r&   c                    s~   t    || | }ttj|| |ddddt tj|dddddt | _d| _|j	| _	|j
| _
t|tjd| _dS )a  Per-pixel MLP followed by a Conditional Log Binomial softmax.

        Args:
            in_features (`int`):
                Number of input channels in the main feature.
            condition_dim (`int`):
                Number of input channels in the condition feature.
            n_classes (`int`, *optional*, defaults to 256):
                Number of classes.
            bottleneck_factor (`int`, *optional*, defaults to 2):
                Hidden dim factor.

        """
        super().__init__()

        bottleneck = (in_features + condition_dim) // bottleneck_factor
        self.mlp = nn.Sequential(
            nn.Conv2d(in_features + condition_dim, bottleneck, kernel_size=1, stride=1, padding=0),
            nn.GELU(),
            # 2 for the probabilities linear norm, 2 for the temperature linear norm
            nn.Conv2d(bottleneck, 2 + 2, kernel_size=1, stride=1, padding=0),
            nn.Softplus(),
        )

        self.p_eps = 1e-4
        self.max_temp = config.max_temp
        self.min_temp = config.min_temp
        self.log_binomial_transform = LogBinomialSoftmax(n_classes, act=torch.softmax)

    def forward(self, main_feature, condition_feature):
        """
        Args:
            main_feature (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Main feature.
            condition_feature (torch.Tensor of shape `(batch_size, num_channels, height, width)`):
                Condition feature.

        Returns:
            `torch.Tensor`:
                Output log binomial distribution
        """
        probabilities_and_temperature = self.mlp(torch.concat((main_feature, condition_feature), dim=1))
        probabilities, temperature = (
            probabilities_and_temperature[:, :2, ...],
            probabilities_and_temperature[:, 2:, ...],
        )

        probabilities = probabilities + self.p_eps
        probabilities = probabilities[:, 0, ...] / (probabilities[:, 0, ...] + probabilities[:, 1, ...])

        temperature = temperature + self.p_eps
        temperature = temperature[:, 0, ...] / (temperature[:, 0, ...] + temperature[:, 1, ...])
        temperature = temperature.unsqueeze(1)
        temperature = (self.max_temp - self.min_temp) * temperature + self.min_temp

        return self.log_binomial_transform(probabilities, temperature)


class ZoeDepthSeedBinRegressor(nn.Module):
    def __init__(self, config, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10):
        """Bin center regressor network.

        Can be "normed" or "unnormed". If "normed", bin centers are bounded on the (min_depth, max_depth) interval.

        Args:
            config (`int`):
                Model configuration.
            n_bins (`int`, *optional*, defaults to 16):
                Number of bin centers.
            mlp_dim (`int`, *optional*, defaults to 256):
                Hidden dimension.
            min_depth (`float`, *optional*, defaults to 1e-3):
                Min depth value.
            max_depth (`float`, *optional*, defaults to 10):
                Max depth value.
        """
        super().__init__()

        self.in_features = config.bottleneck_features
        self.bin_centers_type = config.bin_centers_type
        self.min_depth = min_depth
        self.max_depth = max_depth

        self.conv1 = nn.Conv2d(self.in_features, mlp_dim, 1, 1, 0)
        self.act1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(mlp_dim, n_bins, 1, 1, 0)
        self.act2 = nn.ReLU(inplace=True) if self.bin_centers_type == "normed" else nn.Softplus()

    def forward(self, x):
        """
        Returns tensor of bin_width vectors (centers). One vector b for every pixel
        """
        x = self.conv1(x)
        x = self.act1(x)
        x = self.conv2(x)
        x = self.act2(x)

        if self.bin_centers_type == "normed":
            bin_centers = x + 1e-3
            bin_widths_normed = bin_centers / bin_centers.sum(dim=1, keepdim=True)
            # shape (batch_size, num_channels, height, width)
            bin_widths = (self.max_depth - self.min_depth) * bin_widths_normed
            # pad has the form (left, right, top, bottom, front, back)
            bin_widths = nn.functional.pad(bin_widths, (0, 0, 0, 0, 1, 0), mode="constant", value=self.min_depth)
            # shape (batch_size, num_channels, height, width)
            bin_edges = torch.cumsum(bin_widths, dim=1)

            bin_centers = 0.5 * (bin_edges[:, :-1, ...] + bin_edges[:, 1:, ...])
            return bin_widths_normed, bin_centers
        else:
            return x, x
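

# Illustrative sketch (not part of the original module): how normalized bin widths become
# bin centers in the "normed" branch above. Widths summing to (max_depth - min_depth) are
# cumulatively summed into edges; midpoints of consecutive edges are the centers.
def _demo_bin_centers(min_depth=1e-3, max_depth=10.0):
    bin_widths_normed = torch.softmax(torch.randn(1, 16, 1, 1), dim=1)
    bin_widths = (max_depth - min_depth) * bin_widths_normed
    # prepend min_depth along the channel dimension so the first edge is the minimum depth
    bin_widths = nn.functional.pad(bin_widths, (0, 0, 0, 0, 1, 0), mode="constant", value=min_depth)
    bin_edges = torch.cumsum(bin_widths, dim=1)
    bin_centers = 0.5 * (bin_edges[:, :-1, ...] + bin_edges[:, 1:, ...])
    assert bin_centers.shape == (1, 16, 1, 1)
    return bin_centers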


@torch.jit.script
def inv_attractor(dx, alpha: float = 300, gamma: int = 2):
    """Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center
    This is the default one according to the accompanying paper.

    Args:
        dx (`torch.Tensor`):
            The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
        alpha (`float`, *optional*, defaults to 300):
            Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction.
        gamma (`int`, *optional*, defaults to 2):
            Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected.
            Lower gamma = farther reach.

    Returns:
        torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc
    """
    return dx.div(1 + alpha * dx.pow(gamma))
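

# Illustrative sketch (not part of the original module): the shift returned by
# `inv_attractor` is close to dx for small distances (strong attraction) and is damped
# as alpha * dx^gamma grows, so far-away bin centers barely move.
def _demo_inv_attractor():
    dx = torch.tensor([0.001, 0.1, 1.0])
    shifts = inv_attractor(dx)  # alpha=300, gamma=2 by default
    # 0.001 -> ~0.001, 0.1 -> 0.025, 1.0 -> ~0.0033
    assert torch.all(shifts < dx)
    return shifts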


class ZoeDepthAttractorLayer(nn.Module):
    def __init__(
        self,
        config,
        n_bins,
        n_attractors=16,
        min_depth=1e-3,
        max_depth=10,
        memory_efficient=False,
    ):
        """
        Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth)
        """
        super().__init__()

        self.alpha = config.attractor_alpha
        self.gamma = config.attractor_gamma
        self.kind = config.attractor_kind

        self.n_attractors = n_attractors
        self.n_bins = n_bins
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.memory_efficient = memory_efficient

        in_features = mlp_dim = config.bin_embedding_dim
        self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0)
        self.act1 = nn.ReLU(inplace=True)
        # x2 for the linear norm
        self.conv2 = nn.Conv2d(mlp_dim, n_attractors * 2, 1, 1, 0)
        self.act2 = nn.ReLU(inplace=True)

    def forward(self, x, prev_bin, prev_bin_embedding=None, interpolate=True):
        """
        The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers
        and the attractor points (the latter are predicted by the MLP).

        Args:
            x (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Feature block.
            prev_bin (`torch.Tensor` of shape `(batch_size, prev_number_of_bins, height, width)`):
                Previous bin centers normed.
            prev_bin_embedding (`torch.Tensor`, *optional*):
                Optional previous bin embeddings.
            interpolate (`bool`, *optional*, defaults to `True`):
                Whether to interpolate the previous bin embeddings to the size of the input features.

        Returns:
            `Tuple[`torch.Tensor`, `torch.Tensor`]:
                New bin centers normed and scaled.
        """
        if prev_bin_embedding is not None:
            if interpolate:
                prev_bin_embedding = nn.functional.interpolate(
                    prev_bin_embedding, x.shape[-2:], mode="bilinear", align_corners=True
                )
            x = x + prev_bin_embedding

        attractors = self.conv1(x)
        attractors = self.act1(attractors)
        attractors = self.conv2(attractors)
        attractors = self.act2(attractors)
        attractors = attractors + 1e-3
        batch_size, _, height, width = attractors.shape
        attractors = attractors.view(batch_size, self.n_attractors, 2, height, width)
        # shape (batch_size, num_attractors, height, width)
        attractors_normed = attractors[:, :, 0, ...]

        prev_bin = nn.functional.interpolate(prev_bin, (height, width), mode="bilinear", align_corners=True)

        if not self.memory_efficient:
            func = {"mean": torch.mean, "sum": torch.sum}[self.kind]
            # shape (batch_size, num_bins, height, width)
            delta_c = func(inv_attractor(attractors_normed.unsqueeze(2) - prev_bin.unsqueeze(1)), dim=1)
        else:
            delta_c = torch.zeros_like(prev_bin, device=prev_bin.device)
            for i in range(self.n_attractors):
                # shape (batch_size, num_bins, height, width)
                delta_c += inv_attractor(attractors_normed[:, i, ...].unsqueeze(1) - prev_bin)

            if self.kind == "mean":
                delta_c = delta_c / self.n_attractors

        bin_new_centers = prev_bin + delta_c
        bin_new_centers = (self.max_depth - self.min_depth) * bin_new_centers + self.min_depth
        bin_new_centers, _ = torch.sort(bin_new_centers, dim=1)
        bin_new_centers = torch.clip(bin_new_centers, self.min_depth, self.max_depth)
        return bin_new_centers, bin_new_centers
zZoeDepthAttractorLayer.forward)r   r   r   FNTr^   r   r   r8   r    r   I  s    r   c                       s0   e Zd Z				d
 fdd	Zddd	Z  ZS )ZoeDepthAttractorLayerUnnormedr   r   r   Tc           	         s   t    || _|| _|| _|| _|j| _|j| _|j	| _
|| _|j }}t||ddd| _tjdd| _t||ddd| _t | _dS )zL
        Attractor layer for bin centers. Bin centers are unbounded
        """
        super().__init__()

        self.n_attractors = n_attractors
        self.n_bins = n_bins
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.alpha = config.attractor_alpha
        self.gamma = config.attractor_gamma
        self.kind = config.attractor_kind
        self.memory_efficient = memory_efficient

        in_features = mlp_dim = config.bin_embedding_dim
        self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0)
        self.act1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0)
        self.act2 = nn.Softplus()

    def forward(self, x, prev_bin, prev_bin_embedding=None, interpolate=True):
        """
        The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers
        and the attractor points (the latter are predicted by the MLP).

        Args:
            x (`torch.Tensor` of shape (batch_size, num_channels, height, width)`):
                Feature block.
            prev_bin (`torch.Tensor` of shape (batch_size, prev_num_bins, height, width)`):
                Previous bin centers normed.
            prev_bin_embedding (`torch.Tensor`, *optional*):
                Optional previous bin embeddings.
            interpolate (`bool`, *optional*, defaults to `True`):
                Whether to interpolate the previous bin embeddings to the size of the input features.

        Returns:
            `Tuple[`torch.Tensor`, `torch.Tensor`]:
                New bin centers unbounded. Two outputs just to keep the API consistent with the normed version.
        """
        if prev_bin_embedding is not None:
            if interpolate:
                prev_bin_embedding = nn.functional.interpolate(
                    prev_bin_embedding, x.shape[-2:], mode="bilinear", align_corners=True
                )
            x = x + prev_bin_embedding

        attractors = self.conv1(x)
        attractors = self.act1(attractors)
        attractors = self.conv2(attractors)
        attractors = self.act2(attractors)

        height, width = attractors.shape[-2:]

        prev_bin = nn.functional.interpolate(prev_bin, (height, width), mode="bilinear", align_corners=True)

        if not self.memory_efficient:
            func = {"mean": torch.mean, "sum": torch.sum}[self.kind]
            # shape (batch_size, num_bins, height, width)
            delta_c = func(inv_attractor(attractors.unsqueeze(2) - prev_bin.unsqueeze(1)), dim=1)
        else:
            delta_c = torch.zeros_like(prev_bin, device=prev_bin.device)
            for i in range(self.n_attractors):
                # shape (batch_size, num_bins, height, width)
                delta_c += inv_attractor(attractors[:, i, ...].unsqueeze(1) - prev_bin)

            if self.kind == "mean":
                delta_c = delta_c / self.n_attractors

        bin_new_centers = prev_bin + delta_c
        bin_centers = bin_new_centers

        return bin_new_centers, bin_centers


class ZoeDepthProjector(nn.Module):
    def __init__(self, in_features, out_features, mlp_dim=128):
        """Projector MLP.

        Args:
            in_features (`int`):
                Number of input channels.
            out_features (`int`):
                Number of output channels.
            mlp_dim (`int`, *optional*, defaults to 128):
                Hidden dimension.
        """
        super().__init__()

        self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0)
        self.act1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(mlp_dim, out_features, 1, 1, 0)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.conv1(hidden_state)
        hidden_state = self.act1(hidden_state)
        hidden_state = self.conv2(hidden_state)

        return hidden_state
zZoeDepthProjector.forward)r   )r   r   r   r(   r   rO   rM   rP   r   r   r8   r    r     s    r   c                       st   e Zd ZdZ fddZdejdejfddZ			dd
ejdejdejdeej	 dee
 deej fddZ  ZS )ZoeDepthMultiheadAttentionzKEquivalent implementation of nn.MultiheadAttention with `batch_first=True`.c                    s   t    || dkrtd| d| d|| _t|| | _| j| j | _t|| j| _	t|| j| _
t|| j| _t||| _t|| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())r'   r(   r   num_attention_headsr[   attention_head_sizeall_head_sizer   r3   querykeyr   out_projDropoutdropout)r4   r6   r   r   r8   r   r    r(     s   
z#ZoeDepthMultiheadAttention.__init__r   r:   c                 C   s6   |  d d | j| jf }||}|ddddS )Nr=   r   r&   r   r   )rw   r   r   r   rA   )r4   r   Znew_x_shaper   r   r    transpose_for_scores/  s   
z/ZoeDepthMultiheadAttention.transpose_for_scoresNFquerieskeysvaluesattention_maskoutput_attentionsc                 C   s   |  | |}|  | |}|  | |}t||dd}	|	t| j	 }	|d ur2|	| }	t
jj|	dd}
| |
}
t|
|}|dddd }| d d | jf }||}| |}|rm||
f}|S |f}|S )Nr=   r   r;   r   r&   r   r   )r   r   r   r   r   matmulZ	transposemathsqrtr   r   rz   r   r   rA   rB   rw   r   r   r   )r4   r   r   r   r   r   Zquery_layerZ	key_layerZvalue_layerZattention_scoresZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr   r   r    rM   4  s$   


z"ZoeDepthMultiheadAttention.forward)NF)r   r   r   r   r(   r   rO   r   r   r   boolr   rM   rP   r   r   r8   r    r     s&    
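

# Illustrative sketch (not part of the original module): the block above consumes
# (batch_size, sequence_length, hidden_size) tensors directly, i.e. the
# `batch_first=True` convention from the class docstring. Sizes are hypothetical.
def _demo_multihead_attention_shapes():
    attention = ZoeDepthMultiheadAttention(hidden_size=128, num_attention_heads=4, dropout=0.0)
    sequence = torch.randn(2, 10, 128)
    context, attention_probs = attention(sequence, sequence, sequence, output_attentions=True)
    assert context.shape == (2, 10, 128)
    assert attention_probs.shape == (2, 4, 10, 10)  # one map per attention head
    return context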
r   c                       s6   e Zd Zd	 fdd	Z	d
deej fddZ  ZS )ZoeDepthTransformerEncoderLayer皙?reluc                    s   t    |j}|j}|j}t|||d| _t||| _	t
|| _t||| _t|| _t|| _t
|| _t
|| _t| | _d S )N)r   )r'   r(   patch_transformer_hidden_sizeZ#patch_transformer_intermediate_sizeZ%patch_transformer_num_attention_headsr   	self_attnr   r3   linear1r   r   linear2	LayerNormnorm1norm2dropout1dropout2r	   
activation)r4   r5   r   r  r6   Zintermediate_sizer   r8   r   r    r(   ]  s   
z(ZoeDepthTransformerEncoderLayer.__init__Nsrc_maskc              	   C   sn   | }}| j ||||dd }|| | }| |}| | | | |}|| | }| |}|S )N)r   r   r   r   r   )	r	  r  r  r  r   r  r
  r  r  )r4   srcr  r   r   Zsrc2r   r   r    rM   q  s   



class ZoeDepthPatchTransformerEncoder(nn.Module):
    def __init__(self, config):
        """ViT-like transformer block

        Args:
            config (`ZoeDepthConfig`):
                Model configuration class defining the model architecture.
        """
        super().__init__()

        in_channels = config.bottleneck_features

        self.transformer_encoder = nn.ModuleList(
            [ZoeDepthTransformerEncoderLayer(config) for _ in range(config.num_patch_transformer_layers)]
        )

        self.embedding_convPxP = nn.Conv2d(
            in_channels, config.patch_transformer_hidden_size, kernel_size=1, stride=1, padding=0
        )

    def positional_encoding_1d(self, batch_size, sequence_length, embedding_dim, device="cpu", dtype=torch.float32):
        """Generate positional encodings

        Args:
            sequence_length (int): Sequence length
            embedding_dim (int): Embedding dimension

        Returns:
            torch.Tensor: Positional encodings.
        """
        position = torch.arange(0, sequence_length, dtype=dtype, device=device).unsqueeze(1)
        index = torch.arange(0, embedding_dim, 2, dtype=dtype, device=device).unsqueeze(0)
        div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0)) / embedding_dim))
        pos_encoding = position * div_term
        pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1)
        pos_encoding = pos_encoding.unsqueeze(dim=0).repeat(batch_size, 1, 1)
        return pos_encoding

    def forward(self, x):
        """Forward pass

        Args:
            x (torch.Tensor - NCHW): Input feature tensor

        Returns:
            torch.Tensor - Transformer output embeddings of shape (batch_size, sequence_length, embedding_dim)
        """
        embeddings = self.embedding_convPxP(x).flatten(2)  # shape (batch_size, num_channels, sequence_length)
        # add an extra special token at the start, which will serve as classification token
        embeddings = nn.functional.pad(embeddings, (1, 0))

        embeddings = embeddings.permute(0, 2, 1)
        batch_size, sequence_length, embedding_dim = embeddings.shape
        embeddings = embeddings + self.positional_encoding_1d(
            batch_size, sequence_length, embedding_dim, device=embeddings.device, dtype=embeddings.dtype
        )

        for i in range(4):
            embeddings = self.transformer_encoder[i](embeddings)

        return embeddings


class ZoeDepthMLPClassifier(nn.Module):
    def __init__(self, in_features, out_features) -> None:
        super().__init__()

        hidden_features = in_features
        self.linear1 = nn.Linear(in_features, hidden_features)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(hidden_features, out_features)

    def forward(self, hidden_state):
        hidden_state = self.linear1(hidden_state)
        hidden_state = self.activation(hidden_state)
        domain_logits = self.linear2(hidden_state)

        return domain_logits


class ZoeDepthMultipleMetricDepthEstimationHeads(nn.Module):
    """
    Multiple metric depth estimation heads. A MLP classifier is used to route between 2 different heads.
    """

    def __init__(self, config):
        super().__init__()

        bin_embedding_dim = config.bin_embedding_dim
        n_attractors = config.num_attractors
        self.bin_configurations = config.bin_configurations
        self.bin_centers_type = config.bin_centers_type

        # bottleneck convolution
        bottleneck_features = config.bottleneck_features
        self.conv2 = nn.Conv2d(bottleneck_features, bottleneck_features, kernel_size=1, stride=1, padding=0)

        # patch transformer + MLP classifier to predict the domain (e.g. NYU or KITTI)
        self.patch_transformer = ZoeDepthPatchTransformerEncoder(config)
        self.mlp_classifier = ZoeDepthMLPClassifier(in_features=128, out_features=2)

        if self.bin_centers_type == "normed":
            Attractor = ZoeDepthAttractorLayer
        elif self.bin_centers_type == "softplus":
            Attractor = ZoeDepthAttractorLayerUnnormed

        # one seed bin regressor per bin configuration
        self.seed_bin_regressors = nn.ModuleDict(
            {
                conf["name"]: ZoeDepthSeedBinRegressor(
                    config,
                    n_bins=conf["n_bins"],
                    mlp_dim=bin_embedding_dim // 2,
                    min_depth=conf["min_depth"],
                    max_depth=conf["max_depth"],
                )
                for conf in config.bin_configurations
            }
        )

        self.seed_projector = ZoeDepthProjector(
            in_features=bottleneck_features, out_features=bin_embedding_dim, mlp_dim=bin_embedding_dim // 2
        )
        self.projectors = nn.ModuleList(
            [
                ZoeDepthProjector(
                    in_features=config.fusion_hidden_size,
                    out_features=bin_embedding_dim,
                    mlp_dim=bin_embedding_dim // 2,
                )
                for _ in range(4)
            ]
        )

        # one set of attractors per bin configuration
        self.attractors = nn.ModuleDict(
            {
                configuration["name"]: nn.ModuleList(
                    [
                        Attractor(
                            config,
                            n_bins=n_attractors[i],
                            min_depth=configuration["min_depth"],
                            max_depth=configuration["max_depth"],
                        )
                        for i in range(len(n_attractors))
                    ]
                )
                for configuration in config.bin_configurations
            }
        )

        last_in = config.num_relative_features
        # one conditional log binomial head per bin configuration
        self.conditional_log_binomial = nn.ModuleDict(
            {
                configuration["name"]: ZoeDepthConditionalLogBinomialSoftmax(
                    config,
                    last_in,
                    bin_embedding_dim,
                    configuration["n_bins"],
                    bottleneck_factor=4,
                )
                for configuration in config.bin_configurations
            }
        )

    def forward(self, outconv_activation, bottleneck, feature_blocks, relative_depth):
        x = self.conv2(bottleneck)

        # predict which head to use: the embedding of the special token is
        # the input to the domain classifier
        embedding = self.patch_transformer(x)[:, 0, :]
        domain_logits = self.mlp_classifier(embedding)  # shape (batch_size, 2)

        # the entire batch is routed to a single metric head based on a vote
        domain_vote = torch.softmax(domain_logits.sum(dim=0, keepdim=True), dim=-1)  # shape (1, 2)

        names = [configuration["name"] for configuration in self.bin_configurations]
        bin_configurations_name = names[torch.argmax(domain_vote, dim=-1).squeeze().item()]

        try:
            conf = [config for config in self.bin_configurations if config["name"] == bin_configurations_name][0]
        except IndexError:
            raise ValueError(f"bin_configurations_name {bin_configurations_name} not found in bin_configurations")

        min_depth = conf["min_depth"]
        max_depth = conf["max_depth"]

        seed_bin_regressor = self.seed_bin_regressors[bin_configurations_name]
        _, seed_bin_centers = seed_bin_regressor(x)
        if self.bin_centers_type in ["normed", "hybrid2"]:
            prev_bin = (seed_bin_centers - min_depth) / (max_depth - min_depth)
        else:
            prev_bin = seed_bin_centers

        prev_bin_embedding = self.seed_projector(x)

        attractors = self.attractors[bin_configurations_name]
        for projector, attractor, feature in zip(self.projectors, attractors, feature_blocks):
            bin_embedding = projector(feature)
            bin, _ = attractor(bin_embedding, prev_bin, prev_bin_embedding, interpolate=True)
            prev_bin = bin
            prev_bin_embedding = bin_embedding

        last = outconv_activation

        bin_centers = nn.functional.interpolate(bin, last.shape[2:], mode="bilinear", align_corners=True)
        bin_embedding = nn.functional.interpolate(bin_embedding, last.shape[2:], mode="bilinear", align_corners=True)

        conditional_log_binomial = self.conditional_log_binomial[bin_configurations_name]
        x = conditional_log_binomial(last, bin_embedding)

        # depth = sum(p * c), where p are the log-binomial probabilities and c the bin centers
        out = torch.sum(x * bin_centers, dim=1, keepdim=True)

        return out, domain_logits
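

# Illustrative sketch (not part of the original module): the domain logits are summed
# over the batch before the softmax, so the whole batch is routed to a single metric
# head (all images treated as e.g. NYU-style *or* KITTI-style, never a mix).
def _demo_domain_vote():
    domain_logits = torch.tensor([[2.0, 0.1], [0.5, 1.0]])  # hypothetical per-image logits
    domain_vote = torch.softmax(domain_logits.sum(dim=0, keepdim=True), dim=-1)
    names = ["nyu", "kitti"]  # hypothetical bin configuration names
    routed = names[torch.argmax(domain_vote, dim=-1).squeeze().item()]
    assert routed == "nyu"  # summed logits [2.5, 1.1] favor the first head
    return routed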


class ZoeDepthMetricDepthEstimationHead(nn.Module):
    def __init__(self, config):
        super().__init__()

        bin_configuration = config.bin_configurations[0]
        n_bins = bin_configuration["n_bins"]
        min_depth = bin_configuration["min_depth"]
        max_depth = bin_configuration["max_depth"]
        bin_embedding_dim = config.bin_embedding_dim
        n_attractors = config.num_attractors
        bin_centers_type = config.bin_centers_type

        self.min_depth = min_depth
        self.max_depth = max_depth
        self.bin_centers_type = bin_centers_type

        # bottleneck convolution
        bottleneck_features = config.bottleneck_features
        self.conv2 = nn.Conv2d(bottleneck_features, bottleneck_features, kernel_size=1, stride=1, padding=0)

        # regressor and attractor
        if self.bin_centers_type == "normed":
            Attractor = ZoeDepthAttractorLayer
        elif self.bin_centers_type == "softplus":
            Attractor = ZoeDepthAttractorLayerUnnormed

        self.seed_bin_regressor = ZoeDepthSeedBinRegressor(
            config, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth
        )
        self.seed_projector = ZoeDepthProjector(in_features=bottleneck_features, out_features=bin_embedding_dim)
        self.projectors = nn.ModuleList(
            [
                ZoeDepthProjector(in_features=config.fusion_hidden_size, out_features=bin_embedding_dim)
                for _ in range(4)
            ]
        )
        self.attractors = nn.ModuleList(
            [
                Attractor(
                    config,
                    n_bins=n_bins,
                    n_attractors=n_attractors[i],
                    min_depth=min_depth,
                    max_depth=max_depth,
                )
                for i in range(4)
            ]
        )

        last_in = config.num_relative_features + 1  # +1 for the relative depth

        # use log binomial instead of softmax
        self.conditional_log_binomial = ZoeDepthConditionalLogBinomialSoftmax(
            config, last_in, bin_embedding_dim, n_classes=n_bins
        )

    def forward(self, outconv_activation, bottleneck, feature_blocks, relative_depth):
        x = self.conv2(bottleneck)

        _, seed_bin_centers = self.seed_bin_regressor(x)
        if self.bin_centers_type in ["normed", "hybrid2"]:
            prev_bin = (seed_bin_centers - self.min_depth) / (self.max_depth - self.min_depth)
        else:
            prev_bin = seed_bin_centers

        prev_bin_embedding = self.seed_projector(x)

        for projector, attractor, feature in zip(self.projectors, self.attractors, feature_blocks):
            bin_embedding = projector(feature)
            bin, _ = attractor(bin_embedding, prev_bin, prev_bin_embedding, interpolate=True)
            prev_bin = bin.clone()
            prev_bin_embedding = bin_embedding.clone()

        last = outconv_activation

        # condition the head on the relative depth: first interpolate it to the size of `last`
        relative_conditioning = relative_depth.unsqueeze(1)
        relative_conditioning = nn.functional.interpolate(
            relative_conditioning, size=last.shape[2:], mode="bilinear", align_corners=True
        )
        last = torch.cat([last, relative_conditioning], dim=1)

        bin_embedding = nn.functional.interpolate(bin_embedding, last.shape[2:], mode="bilinear", align_corners=True)
        x = self.conditional_log_binomial(last, bin_embedding)

        # depth = sum(p * c), where p are the log-binomial probabilities and c the bin centers
        bin_centers = nn.functional.interpolate(bin, last.shape[2:], mode="bilinear", align_corners=True)
        out = torch.sum(x * bin_centers, dim=1, keepdim=True)

        return out, None


@auto_docstring
class ZoeDepthPreTrainedModel(PreTrainedModel):
    config_class = ZoeDepthConfig
    base_model_prefix = "zoedepth"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring(
    custom_intro="""
    ZoeDepth model with one or multiple metric depth estimation head(s) on top.
    )Zcustom_introc                       sn   e Zd Z fddZe				ddejdeej dee	 dee	 dee	 d	e
eej ef fd
dZ  ZS )ZoeDepthForDepthEstimationc                    s   t  | t|| _t| jjdr&t| jjdr&| jjj|_| jjj| _nt	dt
|| _t|| _t|jdkr?t|nt|| _|   d S )Nr6   
patch_sizezXZoeDepth assumes the backbone's config to have `hidden_size` and `patch_size` attributesr   )r'   r(   r   backbonehasattrr5   r6   r1   rV  r   r}   neckr   relative_headra   r5  r%  rM  metric_headZ	post_init)r4   r5   r8   r   r    r(     s   


z#ZoeDepthForDepthEstimation.__init__NrP  labelsr   output_hidden_statesreturn_dictr:   c                 C   sR  d}|dur
t d|dur|n| jj}|dur|n| jj}|dur$|n| jj}| jj|||d}|j}|j\}	}	}
}| j	}|
| }|| }| 
|||\}}|g| }| |\}}|g| }| j|d |d |dd |d\}}|jdd}|s|dur||f|dd  }n	|f|dd  }|dur|f| S |S t||||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti")
        >>> model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     source_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```
        """
        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not implemented yet")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )
        hidden_states = outputs.feature_maps

        _, _, height, width = pixel_values.shape
        patch_size = self.patch_size
        patch_height = height // patch_size
        patch_width = width // patch_size

        hidden_states, features = self.neck(hidden_states, patch_height, patch_width)

        out = [features] + hidden_states

        relative_depth, features = self.relative_head(hidden_states)

        out = [features] + out

        metric_depth, domain_logits = self.metric_head(
            outconv_activation=out[0], bottleneck=out[1], feature_blocks=out[2:], relative_depth=relative_depth
        )
        metric_depth = metric_depth.squeeze(dim=1)

        if not return_dict:
            if domain_logits is not None:
                output = (metric_depth, domain_logits) + outputs[1:]
            else:
                output = (metric_depth,) + outputs[1:]

            return ((loss,) + output) if loss is not None else output

        return ZoeDepthDepthEstimatorOutput(
            loss=loss,
            predicted_depth=metric_depth,
            domain_logits=domain_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["ZoeDepthForDepthEstimation", "ZoeDepthPreTrainedModel"]