"""PyTorch ConvNextV2 model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.backbone_utils import BackboneMixin
from .configuration_convnextv2 import ConvNextV2Config


logger = logging.get_logger(__name__)


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # Work with tensors of any dimensionality, not just 4D conv feature maps.
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize into a per-sample keep mask
    output = input.div(keep_prob) * random_tensor
    return output
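
# Illustrative sketch (not part of the original module): because surviving samples
# are rescaled by 1 / keep_prob, drop_path preserves the expected activation. The
# helper name below is hypothetical and exists only for demonstration.
def _drop_path_expectation_demo() -> None:
    x = torch.ones(10_000, 8)
    out = drop_path(x, drop_prob=0.2, training=True)
    # About 80% of the rows survive, each scaled by 1 / 0.8, so the mean stays ~1.0.
    print(out.mean().item())
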
class ConvNextV2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class ConvNextV2GRN(nn.Module):
    """GRN (Global Response Normalization) layer"""

    def __init__(self, dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        # Per-channel L2 norm over the spatial dimensions, then divisive
        # normalization by the mean norm across channels.
        global_features = torch.linalg.vector_norm(hidden_states, ord=2, dim=(1, 2), keepdim=True)
        norm_features = global_features / (global_features.mean(dim=-1, keepdim=True) + 1e-6)
        hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states
        return hidden_states
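
# Sketch (hypothetical helper, not in the original file): GRN expects channels_last
# input, as used inside ConvNextV2Layer. With weight and bias initialized to zero,
# the layer starts out as the identity and only the residual term passes through.
def _grn_identity_demo() -> None:
    grn = ConvNextV2GRN(dim=4)
    hidden_states = torch.randn(2, 7, 7, 4)  # (batch, height, width, channels)
    print(torch.equal(grn(hidden_states), hidden_states))  # True at initialization
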
 fdd	Zdejdejfdd	Z  ZS )ConvNextV2LayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    rB   channels_lastc                    s`   t    tt|| _tt|| _|| _	|| _
| j
dvr*td| j
 |f| _d S )N)rG   channels_firstzUnsupported data format: )r&   r'   r   r:   r   Zonesr<   r;   r=   epsdata_formatNotImplementedErrornormalized_shape)r(   rL   rI   rJ   r)   r!   r"   r'   d   s   

zConvNextV2LayerNorm.__init__xr   c                 C   s   | j dkrtjj|| j| j| j| j}|S | j dkr]|j	}|
 }|jddd}|| djddd}|| t|| j  }|j|d}| jd d d d f | | jd d d d f  }|S )NrG   rH   r   T)r@   r>   )r   )rJ   r   r   Z
functionalZ
layer_normrL   r<   r=   rI   r   r4   rC   powsqrtto)r(   rM   Zinput_dtypeusr!   r!   r"   r-   n   s   
	
,zConvNextV2LayerNorm.forward)rB   rG   )	r0   r1   r2   r3   r'   r   r5   r-   r7   r!   r!   r)   r"   rF   ^   s    
class ConvNextV2Embeddings(nn.Module):
    """This class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    """

    def __init__(self, config):
        super().__init__()
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, config.hidden_sizes[0], kernel_size=config.patch_size, stride=config.patch_size
        )
        self.layernorm = ConvNextV2LayerNorm(config.hidden_sizes[0], eps=1e-6, data_format="channels_first")
        self.num_channels = config.num_channels

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embeddings = self.patch_embeddings(pixel_values)
        embeddings = self.layernorm(embeddings)
        return embeddings
class ConvNextV2Layer(nn.Module):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: (1) [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in
    (N, C, H, W); (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    """

    def __init__(self, config, dim, drop_path=0.0):
        super().__init__()
        # depthwise conv
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.layernorm = ConvNextV2LayerNorm(dim, eps=1e-6)
        # pointwise/1x1 convs, implemented with linear layers
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = ACT2FN[config.hidden_act]
        self.grn = ConvNextV2GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.drop_path = ConvNextV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        input = hidden_states
        x = self.dwconv(hidden_states)
        # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
        x = x.permute(0, 2, 3, 1)
        x = self.layernorm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
        x = x.permute(0, 3, 1, 2)
        x = input + self.drop_path(x)
        return x
 fdd	Zdejdejfdd	Z  Z	S )ConvNextV2Stagea  ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`List[float]`): Stochastic depth rates for each layer.
    r>   Nc              	      s   t    |ks|dkr!tt|dddtj|||d| _nt | _p,dg| tj fddt|D  | _	d S )	Nr   rB   rH   rW   rT   r   c                    s   g | ]}t  | d qS ))r9   r#   )rc   ).0jr^   drop_path_ratesout_channelsr!   r"   
<listcomp>   s    z,ConvNextV2Stage.__init__.<locals>.<listcomp>)
r&   r'   r   Z
SequentialrF   rX   downsampling_layerro   rangelayers)r(   r^   in_channelsru   rU   rV   depthrt   r)   rs   r"   r'      s   


zConvNextV2Stage.__init__r+   r   c                 C   s   |  |}| |}|S r%   )rw   ry   r,   r!   r!   r"   r-      s   

zConvNextV2Stage.forward)r>   r>   r>   Nrb   r!   r!   r)   r"   rp      s    
class ConvNextV2Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.stages = nn.ModuleList()
        # Linearly increasing stochastic depth rates, split into one list per stage.
        drop_path_rates = [
            x.tolist()
            for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu").split(config.depths)
        ]
        prev_chs = config.hidden_sizes[0]
        for i in range(config.num_stages):
            out_chs = config.hidden_sizes[i]
            stage = ConvNextV2Stage(
                config,
                in_channels=prev_chs,
                out_channels=out_chs,
                stride=2 if i > 0 else 1,
                depth=config.depths[i],
                drop_path_rates=drop_path_rates[i],
            )
            self.stages.append(stage)
            prev_chs = out_chs

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        all_hidden_states = () if output_hidden_states else None

        for i, layer_module in enumerate(self.stages):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            hidden_states = layer_module(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )
@auto_docstring
class ConvNextV2PreTrainedModel(PreTrainedModel):
    config_class = ConvNextV2Config
    base_model_prefix = "convnextv2"
    main_input_name = "pixel_values"
    _no_split_modules = ["ConvNextV2Layer"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, ConvNextV2LayerNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, ConvNextV2GRN):
            module.weight.data.zero_()
            module.bias.data.zero_()


@auto_docstring
class ConvNextV2Model(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)

        # final layernorm layer
        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]

        # global average pooling, (N, C, H, W) -> (N, C)
        pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )
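
# Sketch (hypothetical helper): with the default config (patch size 4, three
# stride-2 stages), a 224x224 image yields a 7x7 final feature map; the pooled
# output is its global average passed through the final LayerNorm.
def _model_shapes_demo() -> None:
    model = ConvNextV2Model(ConvNextV2Config())
    outputs = model(torch.randn(1, 3, 224, 224))
    print(outputs.last_hidden_state.shape)  # torch.Size([1, 768, 7, 7])
    print(outputs.pooler_output.shape)  # torch.Size([1, 768])
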
@auto_docstring(
    custom_intro="""
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """
)
class ConvNextV2ForImageClassification(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.convnextv2 = ConvNextV2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convnextv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )
@auto_docstring(
    custom_intro="""
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class ConvNextV2Backbone(ConvNextV2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)
        self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes

        # Add layer norms to hidden states of out_features
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, data_format="channels_first")
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=hidden_states if output_hidden_states else None,
            attentions=None,
        )


__all__ = ["ConvNextV2ForImageClassification", "ConvNextV2Model", "ConvNextV2PreTrainedModel", "ConvNextV2Backbone"]