o
    Zhq                  	   @   s  d Z ddlZddlmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ eeZd:dejdededejfddZG dd de	j Z!G dd de	j Z"G dd de	j Z#G dd de	j Z$G dd de	j Z%G dd  d e	j Z&G d!d" d"e	j Z'G d#d$ d$e	j Z(G d%d& d&e	j Z)eG d'd( d(eZ*eG d)d* d*e*Z+G d+d, d,e	j Z,G d-d. d.e	j Z-G d/d0 d0e	j Z.G d1d2 d2e	j Z/G d3d4 d4e	j Z0ed5d6G d7d8 d8e*Z1g d9Z2dS );zPyTorch GLPN model.    N)ListOptionalTupleUnion)nn   )ACT2FN)BaseModelOutputDepthEstimatorOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
GLPNConfig        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchZrandr   r   Zfloor_div)r   r   r   Z	keep_probr   Zrandom_tensoroutput r   U/var/www/auris/lib/python3.10/site-packages/transformers/models/glpn/modeling_glpn.py	drop_path$   s   
r    c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )GLPNDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                       t    || _d S N)super__init__r   )selfr   	__class__r   r   r%   <      

zGLPNDropPath.__init__hidden_statesc                 C   s   t || j| jS r#   )r    r   r   )r&   r*   r   r   r   forward@   s   zGLPNDropPath.forwardc                 C   s   d | jS )Nzp={})formatr   )r&   r   r   r   
extra_reprC   s   zGLPNDropPath.extra_reprr#   )__name__
__module____qualname____doc__r   floatr%   r   Tensorr+   strr-   __classcell__r   r   r'   r   r!   9   s
    r!   c                       s(   e Zd ZdZ fddZdd Z  ZS )GLPNOverlapPatchEmbeddingsz+Construct the overlapping patch embeddings.c                    s4   t    tj|||||d d| _t|| _d S )N   kernel_sizestridepadding)r$   r%   r   Conv2dproj	LayerNorm
layer_norm)r&   
patch_sizer:   num_channelshidden_sizer'   r   r   r%   K   s   
z#GLPNOverlapPatchEmbeddings.__init__c                 C   s>   |  |}|j\}}}}|ddd}| |}|||fS )Nr7   r   )r=   r   flatten	transposer?   )r&   pixel_values
embeddings_heightwidthr   r   r   r+   W   s
   


z"GLPNOverlapPatchEmbeddings.forwardr.   r/   r0   r1   r%   r+   r5   r   r   r'   r   r6   H   s    r6   c                       s4   e Zd ZdZ fddZdd Z	d	ddZ  ZS )
GLPNEfficientSelfAttentionzSegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
    paper](https://arxiv.org/abs/2102.12122).c                    s   t    || _|| _| j| j dkr td| j d| j dt| j| j | _| j| j | _t	| j| j| _
t	| j| j| _t	| j| j| _t|j| _|| _|dkrktj||||d| _t|| _d S d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   )r9   r:   )r$   r%   rB   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   LinearquerykeyvalueDropoutZattention_probs_dropout_probdropoutsr_ratior<   srr>   r?   r&   configrB   rM   sequence_reduction_ratior'   r   r   r%   f   s,   

z#GLPNEfficientSelfAttention.__init__c                 C   s6   |  d d | j| jf }||}|ddddS )Nr   r7   r   r   )sizerM   rP   viewpermute)r&   r*   Z	new_shaper   r   r   transpose_for_scores   s   
z/GLPNEfficientSelfAttention.transpose_for_scoresFc                 C   s&  |  | |}| jdkr6|j\}}}|ddd||||}| |}|||dddd}| |}|  | |}	|  | 	|}
t
||	dd}|t| j }tjj|dd}| |}t
||
}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr   r   r7   r]   dimr   )ra   rS   rX   r   r`   reshaperY   r?   rT   rU   r   matmulrD   mathsqrtrP   r   Z
functionalZsoftmaxrW   
contiguousr^   rQ   r_   )r&   r*   rH   rI   output_attentionsZquery_layer
batch_sizeseq_lenrA   Z	key_layerZvalue_layerZattention_scoresZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr   r   r   r+      s*   




z"GLPNEfficientSelfAttention.forwardF)r.   r/   r0   r1   r%   ra   r+   r5   r   r   r'   r   rK   b   s    
rK   c                       s$   e Zd Z fddZdd Z  ZS )GLPNSelfOutputc                    s*   t    t||| _t|j| _d S r#   )r$   r%   r   rR   denserV   hidden_dropout_probrW   )r&   r[   rB   r'   r   r   r%      s   
zGLPNSelfOutput.__init__c                 C   s   |  |}| |}|S r#   )rp   rW   )r&   r*   Zinput_tensorr   r   r   r+      s   

zGLPNSelfOutput.forwardr.   r/   r0   r%   r+   r5   r   r   r'   r   ro      s    ro   c                       s.   e Zd Z fddZdd ZdddZ  ZS )	GLPNAttentionc                    s6   t    t||||d| _t||d| _t | _d S )N)r[   rB   rM   r\   )rB   )r$   r%   rK   r&   ro   r   setpruned_headsrZ   r'   r   r   r%      s   
zGLPNAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rc   )lenr   r&   rM   rP   ru   r   rS   rT   rU   r   rp   rQ   union)r&   headsindexr   r   r   prune_heads   s   zGLPNAttention.prune_headsFc                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   )r&   r   )r&   r*   rH   rI   rj   Zself_outputsattention_outputrm   r   r   r   r+      s   zGLPNAttention.forwardrn   )r.   r/   r0   r%   rz   r+   r5   r   r   r'   r   rs      s    rs   c                       &   e Zd Zd fdd	Zdd Z  ZS )
GLPNDWConv   c              	      s(   t    tj||dddd|d| _d S )Nr   r   T)biasgroups)r$   r%   r   r<   dwconv)r&   rd   r'   r   r   r%      s   
zGLPNDWConv.__init__c                 C   sD   |j \}}}|dd||||}| |}|ddd}|S )Nr   r7   )r   rD   r_   r   rC   )r&   r*   rH   rI   rk   rl   rA   r   r   r   r+      s
   
zGLPNDWConv.forward)r~   rr   r   r   r'   r   r}      s    r}   c                       r|   )
GLPNMixFFNNc                    sl   t    |p|}t||| _t|| _t|jt	r"t
|j | _n|j| _t||| _t|j| _d S r#   )r$   r%   r   rR   dense1r}   r   
isinstanceZ
hidden_actr4   r   intermediate_act_fndense2rV   rq   rW   )r&   r[   in_featureshidden_featuresZout_featuresr'   r   r   r%      s   

zGLPNMixFFN.__init__c                 C   sD   |  |}| |||}| |}| |}| |}| |}|S r#   )r   r   r   rW   r   )r&   r*   rH   rI   r   r   r   r+     s   




zGLPNMixFFN.forward)NNrr   r   r   r'   r   r      s    r   c                       s*   e Zd ZdZ fddZdddZ  ZS )	GLPNLayerzCThis corresponds to the Block class in the original implementation.c                    sn   t    t|| _t||||d| _|dkrt|nt | _	t|| _
t|| }t|||d| _d S )N)rB   rM   r\   r   )r   r   )r$   r%   r   r>   layer_norm_1rs   	attentionr!   Identityr    layer_norm_2rO   r   mlp)r&   r[   rB   rM   r    r\   	mlp_ratioZmlp_hidden_sizer'   r   r   r%     s   
zGLPNLayer.__init__Fc           
      C   sr   | j | ||||d}|d }|dd  }| |}|| }| | |||}| |}|| }	|	f| }|S )N)rj   r   r   )r   r   r    r   r   )
r&   r*   rH   rI   rj   Zself_attention_outputsr{   rm   Z
mlp_outputZlayer_outputr   r   r   r+      s   


zGLPNLayer.forwardrn   rJ   r   r   r'   r   r     s    r   c                       s,   e Zd Z fddZ			dddZ  ZS )GLPNEncoderc           	         sX  t     | _dd tjd jt jddD }g }t j	D ]"}|
t j|  j| |dkr5 jn j|d   j| d q!t|| _g }d}t j	D ]@}g }|dkrd| j|d  7 }t j| D ]}|
t  j|  j| |||   j|  j| d qk|
t| qSt|| _t fd	dt j	D | _d S )
Nc                 S   s   g | ]}|  qS r   )item).0xr   r   r   
<listcomp>@  s    z(GLPNEncoder.__init__.<locals>.<listcomp>r   cpu)r   r   )r@   r:   rA   rB   )rB   rM   r    r\   r   c                    s   g | ]
}t  j| qS r   )r   r>   hidden_sizes)r   ir[   r   r   r   h  s    )r$   r%   r[   r   ZlinspaceZdrop_path_ratesumZdepthsrangeZnum_encoder_blocksappendr6   Zpatch_sizesstridesrA   r   r   
ModuleListpatch_embeddingsr   rM   Z	sr_ratiosZ
mlp_ratiosblockr?   )	r&   r[   ZdprrF   r   blockscurZlayersjr'   r   r   r%   ;  sH   
$


zGLPNEncoder.__init__FTc                 C   s   |rdnd }|r
dnd }|j d }|}tt| j| j| jD ]H\}	}
|
\}}}||\}}}t|D ]\}}|||||}|d }|rJ||d f }q2||}||||ddddd }|rf||f }q|sut	dd |||fD S t
|||d	S )
Nr   r   r   r]   r   r7   c                 s   s    | ]	}|d ur|V  qd S r#   r   )r   vr   r   r   	<genexpr>  s    z&GLPNEncoder.forward.<locals>.<genexpr>Zlast_hidden_stater*   
attentions)r   	enumeratezipr   r   r?   re   r`   ri   tupler	   )r&   rE   rj   output_hidden_statesreturn_dictZall_hidden_statesZall_self_attentionsrk   r*   idxr   Zembedding_layerZblock_layerZ
norm_layerrH   rI   r   ZblkZlayer_outputsr   r   r   r+   k  s2   

 
zGLPNEncoder.forward)FFTrr   r   r   r'   r   r   :  s    3r   c                   @   s$   e Zd ZeZdZdZg Zdd ZdS )GLPNPreTrainedModelglpnrE   c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjrF|jjjd| jjd |jdurD|jj|j 
  dS dS t |tjtjfr^|j	j
  |jjd dS dS )zInitialize the weightsr   )meanZstdNg      ?)r   r   rR   r<   weightdataZnormal_r[   Zinitializer_ranger   Zzero_Z	EmbeddingZpadding_idxr>   BatchNorm2dZfill_)r&   moduler   r   r   _init_weights  s   

z!GLPNPreTrainedModel._init_weightsN)	r.   r/   r0   r   Zconfig_classZbase_model_prefixZmain_input_nameZ_no_split_modulesr   r   r   r   r   r     s    r   c                       sd   e Zd Z fddZdd Ze			ddejdee	 dee	 d	ee	 d
e
eef f
ddZ  ZS )	GLPNModelc                    s(   t  | || _t|| _|   d S r#   )r$   r%   r[   r   encoder	post_initr&   r[   r'   r   r   r%     s   
zGLPNModel.__init__c                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   layerr   rz   )r&   Zheads_to_pruner   rx   r   r   r   _prune_heads  s   zGLPNModel._prune_headsNrE   rj   r   r   r   c                 C   s~   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||||d}|d }|s6|f|dd   S t||j|jdS )Nrj   r   r   r   r   r   )r[   rj   r   use_return_dictr   r	   r*   r   )r&   rE   rj   r   r   Zencoder_outputsZsequence_outputr   r   r   r+     s$   	zGLPNModel.forward)NNN)r.   r/   r0   r%   r   r   r   FloatTensorr   boolr   r   r	   r+   r5   r   r   r'   r   r     s$    

r   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )GLPNSelectiveFeatureFusionz
    Selective Feature Fusion module, as explained in the [paper](https://arxiv.org/abs/2201.07436) (section 3.4). This
    module adaptively selects and integrates local and global features by attaining an attention map for each feature.
    @   c              	      s   t    ttjt|d |ddddt|t | _ttj|t|d ddddtt|d t | _	tjt|d ddddd| _
t | _d S )Nr7   r   r   )in_channelsout_channelsr9   r:   r;   )r$   r%   r   
Sequentialr<   rO   r   ReLUconvolutional_layer1convolutional_layer2convolutional_layer3ZSigmoidsigmoid)r&   Z
in_channelr'   r   r   r%     s   
z#GLPNSelectiveFeatureFusion.__init__c                 C   s   t j||fdd}| |}| |}| |}| |}||d d dd d d d f d ||d d dd d d d f d  }|S )Nr   rc   r   )r   catr   r   r   r   Z	unsqueeze)r&   Zlocal_featuresZglobal_featuresfeaturesZattnZhybrid_featuresr   r   r   r+     s   



(z"GLPNSelectiveFeatureFusion.forward)r   rJ   r   r   r'   r   r     s    r   c                       s&   e Zd Z fddZdddZ  ZS )GLPNDecoderStagec                    sP   t    ||k}|stj||ddnt | _t|| _tjdddd| _	d S )Nr   )r9   r7   bilinearFZscale_factormodeZalign_corners)
r$   r%   r   r<   r   convolutionr   fusionUpsampleupsample)r&   r   r   should_skipr'   r   r   r%     s
   

zGLPNDecoderStage.__init__Nc                 C   s,   |  |}|d ur| ||}| |}|S r#   )r   r   r   )r&   hidden_stateZresidualr   r   r   r+     s
   

zGLPNDecoderStage.forwardr#   rr   r   r   r'   r   r     s    r   c                       s:   e Zd Z fddZdeej deej fddZ  ZS )GLPNDecoderc                    s\   t    |jd d d }|j t fdd|D | _d | jd _tjdddd| _	d S )	Nr]   c                    s   g | ]}t | qS r   )r   )r   rB   r   r   r   r   )  s    z(GLPNDecoder.__init__.<locals>.<listcomp>r   r7   r   Fr   )
r$   r%   r   decoder_hidden_sizer   r   stagesr   r   final_upsample)r&   r[   Zreserved_hidden_sizesr'   r   r   r%   "  s   
zGLPNDecoder.__init__r*   r   c                 C   sN   g }d }t |d d d | jD ]\}}|||}|| q| ||d< |S )Nr]   )r   r   r   r   )r&   r*   Zstage_hidden_statesZstage_hidden_stater   Zstager   r   r   r+   0  s   
zGLPNDecoder.forward	r.   r/   r0   r%   r   r   r3   r+   r5   r   r   r'   r   r   !  s    &r   c                       r   )	SiLogLossz
    Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://arxiv.org/abs/1406.2283).

    $$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}^{2}\right)$$ where $d_{i}=\log y_{i}-\log
    y_{i}^{*}$.

          ?c                    r"   r#   )r$   r%   lambd)r&   r   r'   r   r   r%   E  r)   zSiLogLoss.__init__c                 C   sX   |dk  }t|| t||  }tt|d | jt| d  }|S )Nr   r7   )detachr   logrh   powr   r   )r&   predtargetZ
valid_maskZdiff_loglossr   r   r   r+   I  s   ,zSiLogLoss.forward)r   rJ   r   r   r'   r   r   <  s    r   c                       s6   e Zd Z fddZdeej dejfddZ  ZS )GLPNDepthEstimationHeadc                    sR   t    || _|j}ttj||ddddtjddtj|ddddd| _d S )Nr   r   r8   F)Zinplace)	r$   r%   r[   r   r   r   r<   r   head)r&   r[   Zchannelsr'   r   r   r%   R  s   


z GLPNDepthEstimationHead.__init__r*   r   c                 C   s8   || j j }| |}t|| j j }|jdd}|S )Nr   rc   )r[   Zhead_in_indexr   r   r   	max_depthZsqueeze)r&   r*   predicted_depthr   r   r   r+   ^  s
   
zGLPNDepthEstimationHead.forwardr   r   r   r'   r   r   Q  s    "r   zg
    GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
    )Zcustom_introc                       sn   e Zd Z fddZe				ddejdeej dee dee dee d	e	e
ej ef fd
dZ  ZS )GLPNForDepthEstimationc                    s6   t  | t|| _t|| _t|| _|   d S r#   )	r$   r%   r   r   r   decoderr   r   r   r   r'   r   r   r%   p  s
   


zGLPNForDepthEstimation.__init__NrE   labelsrj   r   r   r   c                 C   s   |dur|n| j j}|dur|n| j j}| j||d|d}|r"|jn|d }| |}| |}	d}
|dur>t }||	|}
|s`|rL|	f|dd  }n	|	f|dd  }|
dur^|
f| S |S t|
|	|rh|jnd|j	dS )a  
        labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, GLPNForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
        >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NTr   r   r7   )r   r   r*   r   )
r[   r   r   r   r*   r   r   r   r
   r   )r&   rE   r   rj   r   r   rm   r*   outr   r   Zloss_fctr   r   r   r   r+   z  s6   .


zGLPNForDepthEstimation.forward)NNNN)r.   r/   r0   r%   r   r   r   r   r   r   r   r3   r
   r+   r5   r   r   r'   r   r   j  s(    
r   )r   r   r   r   )r   F)3r1   rg   typingr   r   r   r   r   Ztorch.utils.checkpointr   Zactivationsr   Zmodeling_outputsr	   r
   Zmodeling_utilsr   Zpytorch_utilsr   r   utilsr   r   Zconfiguration_glpnr   Z
get_loggerr.   loggerr3   r2   r   r    Moduler!   r6   rK   ro   rs   r}   r   r   r   r   r   r   r   r   r   r   r   __all__r   r   r   r   <module>   sJ   
 R'+X5,^