a
    hA                  	   @   s  d dl mZ d dlmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ ddlmZ ddlmZ d	d
lmZmZmZ d	dlmZ d	dlmZmZ g dZG dd de
jZG dd de
jZG dd de
jZG dd de
j Z!G dd de
j Z"G dd de
jZ#G dd de
jZ$G dd de
j Z%e&ee!e"f  ee&eeeef   e'e( ede
j f ee e)ee%dd d!Z*d"ed#d$d%Z+G d&d' d'eZ,G d(d) d)eZ-G d*d+ d+eZ.e ed,e,j/fd-dd.d/ee, e)ee%d0d1d2Z0e ed,e-j/fd-dd.d/ee- e)ee%d0d3d4Z1e ed,e.j/fd-dd.d/ee. e)ee%d0d5d6Z2d	d7lm3Z3 e3e,j/j4e-j/j4e.j/j4d8Z5dS )9    )Sequence)partial)AnyCallableOptionalUnionN)Tensor   )VideoClassification)_log_api_usage_once   )register_modelWeightsWeightsEnum)_KINETICS400_CATEGORIES)_ovewrite_named_paramhandle_legacy_interface)VideoResNetR3D_18_WeightsMC3_18_WeightsR2Plus1D_18_Weightsr3d_18mc3_18r2plus1d_18c                       sP   e Zd Zd	eeee eedd fddZeeeeeef dddZ  Z	S )
Conv3DSimpleN   	in_planes
out_planes	midplanesstridepaddingreturnc                    s   t  j||d||dd d S )N)r	   r	   r	   FZin_channelsZout_channelskernel_sizer    r!   biassuper__init__selfr   r   r   r    r!   	__class__ M/var/www/auris/lib/python3.9/site-packages/torchvision/models/video/resnet.pyr(      s    zConv3DSimple.__init__r    r"   c                 C   s
   | | | fS Nr-   r    r-   r-   r.   get_downsample_stride(   s    z"Conv3DSimple.get_downsample_stride)Nr   r   
__name__
__module____qualname__intr   r(   staticmethodtupler2   __classcell__r-   r-   r+   r.   r      s    r   c                       sL   e Zd Zd	eeeeedd fddZeeeeeef dddZ  ZS )
Conv2Plus1Dr   Nr   c                    s`   t  tj||dd||fd||fddt|tjddtj||d|ddf|ddfdd d S )	Nr   r	   r	   r   r   Fr$   r    r!   r%   TZinplacer	   r   r   r'   r(   nnConv3dBatchNorm3dReLUr)   r+   r-   r.   r(   .   s    
zConv2Plus1D.__init__r/   c                 C   s
   | | | fS r0   r-   r1   r-   r-   r.   r2   ?   s    z!Conv2Plus1D.get_downsample_stride)r   r   )	r4   r5   r6   r7   r(   r8   r9   r2   r:   r-   r-   r+   r.   r;   -   s   r;   c                       sP   e Zd Zd	eeee eedd fddZeeeeeef dddZ  Z	S )
Conv3DNoTemporalNr   r   c                    s(   t  j||dd||fd||fdd d S )Nr<   r   r   Fr#   r&   r)   r+   r-   r.   r(   E   s    zConv3DNoTemporal.__init__r/   c                 C   s
   d| | fS Nr   r-   r1   r-   r-   r.   r2   R   s    z&Conv3DNoTemporal.get_downsample_stride)Nr   r   r3   r-   r-   r+   r.   rE   D   s    rE   c                       sR   e Zd ZdZd
eeedejf eeej dd fddZ	e
e
ddd	Z  ZS )
BasicBlockr   N.inplanesplanesconv_builderr    
downsampler"   c                    s   || d d d |d d d|   }t    t|||||t|tjdd| _t||||t|| _tjdd| _|| _	|| _
d S )Nr	   Tr>   )r'   r(   rA   
SequentialrC   rD   conv1conv2relurL   r    r*   rI   rJ   rK   r    rL   r   r+   r-   r.   r(   [   s    (
zBasicBlock.__init__xr"   c                 C   sB   |}|  |}| |}| jd ur,| |}||7 }| |}|S r0   )rN   rO   rL   rP   r*   rS   Zresidualoutr-   r-   r.   forwardn   s    




zBasicBlock.forward)r   Nr4   r5   r6   	expansionr7   r   rA   Moduler   r(   r   rV   r:   r-   r-   r+   r.   rG   W   s     rG   c                       sR   e Zd ZdZdeeedejf eeej dd fddZ	e
e
dd	d
Z  ZS )
Bottleneck   r   N.rH   c                    s   t    || d d d |d d d|   }ttj||dddt|tjdd| _t|||||t|tjdd| _ttj||| j	 dddt|| j	 | _
tjdd| _|| _|| _d S )Nr	   r   F)r$   r%   Tr>   )r'   r(   rA   rM   rB   rC   rD   rN   rO   rX   conv3rP   rL   r    rQ   r+   r-   r.   r(      s    	
("zBottleneck.__init__rR   c                 C   sL   |}|  |}| |}| |}| jd ur6| |}||7 }| |}|S r0   )rN   rO   r\   rL   rP   rT   r-   r-   r.   rV      s    





zBottleneck.forward)r   NrW   r-   r-   r+   r.   rZ   |   s     rZ   c                       s&   e Zd ZdZdd fddZ  ZS )	BasicStemz$The default conv-batchnorm-relu stemNr"   c              
      s4   t  tjdddddddtdtjdd	 d S )
Nr	   @   )r	      r`   r   r   r   r<   Fr=   Tr>   r@   r*   r+   r-   r.   r(      s
    
zBasicStem.__init__r4   r5   r6   __doc__r(   r:   r-   r-   r+   r.   r]      s   r]   c                       s&   e Zd ZdZdd fddZ  ZS )R2Plus1dStemzRR(2+1)D stem is different than the default one as it uses separated 3D convolutionNr^   c                    sZ   t  tjdddddddtdtjdd	tjdd
dddddtd
tjdd	 d S )Nr	   -   )r   r`   r`   ra   )r   r	   r	   Fr=   Tr>   r_   r?   r   r   r   )r   r   r   r@   rb   r+   r-   r.   r(      s    

zR2Plus1dStem.__init__rc   r-   r-   r+   r.   re      s   re   c                	       s   e Zd Zdeeeef  eeeee	e
f   ee edejf eedd fddZeedd	d
Zdeeeef  eeee	e
f  eeeejdddZ  ZS )r     F.N)blockconv_makerslayersstemnum_classeszero_init_residualr"   c                    s  t    t|  d| _| | _| j||d d|d dd| _| j||d d|d dd| _| j||d d|d dd| _| j||d d	|d dd| _	t
d
| _t
d	|j || _|  D ]}t|t
jrt
jj|jddd |jdurbt
j|jd qt|t
jr4t
j|jd t
j|jd qt|t
jrt
j|jdd t
j|jd q|r|  D ]$}t|trrt
j|jjd qrdS )a^  Generic resnet video generator.

        Args:
            block (Type[Union[BasicBlock, Bottleneck]]): resnet building block
            conv_makers (List[Type[Union[Conv3DSimple, Conv3DNoTemporal, Conv2Plus1D]]]): generator
                function for each layer
            layers (List[int]): number of blocks per layer
            stem (Callable[..., nn.Module]): module specifying the ResNet stem.
            num_classes (int, optional): Dimension of the final FC layer. Defaults to 400.
            zero_init_residual (bool, optional): Zero init bottleneck residual BN. Defaults to False.
        r_   r   r   r1      r      r	   i   rg   Zfan_outrP   )modeZnonlinearityNg{Gz?)r'   r(   r   rI   rl   _make_layerlayer1layer2layer3layer4rA   ZAdaptiveAvgPool3davgpoolZLinearrX   fcmodules
isinstancerB   initZkaiming_normal_Zweightr%   Z	constant_rC   Znormal_rZ   Zbn3)r*   ri   rj   rk   rl   rm   rn   mr+   r-   r.   r(      s2    
zVideoResNet.__init__rR   c                 C   sT   |  |}| |}| |}| |}| |}| |}|d}| |}|S rF   )rl   rs   rt   ru   rv   rw   flattenrx   )r*   rS   r-   r-   r.   rV      s    







zVideoResNet.forwardr   )ri   rK   rJ   blocksr    r"   c           
   	   C   s   d }|dks| j ||j krV||}ttj| j ||j d|ddt||j }g }||| j |||| ||j | _ td|D ]}	||| j || qtj| S )Nr   F)r$   r    r%   )	rI   rX   r2   rA   rM   rB   rC   appendrange)
r*   ri   rK   rJ   r~   r    rL   Z	ds_striderk   ir-   r-   r.   rr   
  s    
zVideoResNet._make_layer)rh   F)r   )r4   r5   r6   typer   rG   rZ   r   r   rE   r;   listr7   r   rA   rY   boolr(   r   rV   rM   rr   r:   r-   r-   r+   r.   r      s*     4 r   .)ri   rj   rk   rl   weightsprogresskwargsr"   c                 K   sT   |d urt |dt|jd  t| |||fi |}|d urP||j|dd |S )Nrm   
categoriesT)r   Z
check_hash)r   lenmetar   Zload_state_dictZget_state_dict)ri   rj   rk   rl   r   r   r   modelr-   r-   r.   _video_resnet$  s    	r   )r   r   zKhttps://github.com/pytorch/vision/tree/main/references/video_classificationzThe weights reproduce closely the accuracy of the paper. The accuracies are estimated on video-level with parameters `frame_rate=15`, `clips_per_video=5`, and `clip_len=16`.)Zmin_sizer   ZrecipeZ_docsc                	   @   sD   e Zd Zedeedddi eddddd	id
dddZeZdS )r   z7https://download.pytorch.org/models/r3d_18-b3b3357e.pthp   r   ro      Z	crop_sizeZresize_sizeiP5Kinetics-400gO@g-T@zacc@1zacc@5gK7YD@g"_@Z
num_paramsZ_metricsZ_ops
_file_sizeurlZ
transformsr   N	r4   r5   r6   r   r   r
   _COMMON_METAKINETICS400_V1DEFAULTr-   r-   r-   r.   r   C  s"   r   c                	   @   sD   e Zd Zedeedddi eddddd	id
dddZeZdS )r   z7https://download.pytorch.org/models/mc3_18-a90a0ba3.pthr   r   r   iPu r   g{GO@gQU@r   gClE@gtVF@r   r   Nr   r-   r-   r-   r.   r   W  s"   r   c                	   @   sD   e Zd Zedeedddi eddddd	id
dddZeZdS )r   z<https://download.pytorch.org/models/r2plus1d_18-91a641e6.pthr   r   r   ir   gʡP@g33333U@r   gOnBD@g1Z^@r   r   Nr   r-   r-   r-   r.   r   k  s"   r   Z
pretrained)r   T)r   r   )r   r   r   r"   c                 K   s.   t | } tttgd g dt| |fi |S )a  Construct 18 layer Resnet3D model.

    .. betastatus:: video module

    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.

    Args:
        weights (:class:`~torchvision.models.video.R3D_18_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.R3D_18_Weights`
            below for more details, and possible values. By default, no
            pre-trained weights are used.
        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.resnet.VideoResNet`` base class.
            Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.R3D_18_Weights
        :members:
    r[   r   r   r   r   )r   verifyr   rG   r   r]   r   r   r   r-   r-   r.   r     s    
r   c                 K   s4   t | } tttgtgd  g dt| |fi |S )a  Construct 18 layer Mixed Convolution network as in

    .. betastatus:: video module

    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.

    Args:
        weights (:class:`~torchvision.models.video.MC3_18_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MC3_18_Weights`
            below for more details, and possible values. By default, no
            pre-trained weights are used.
        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.resnet.VideoResNet`` base class.
            Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MC3_18_Weights
        :members:
    r	   r   )r   r   r   rG   r   rE   r]   r   r-   r-   r.   r     s    
r   c                 K   s.   t | } tttgd g dt| |fi |S )a  Construct 18 layer deep R(2+1)D network as in

    .. betastatus:: video module

    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.

    Args:
        weights (:class:`~torchvision.models.video.R2Plus1D_18_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.R2Plus1D_18_Weights`
            below for more details, and possible values. By default, no
            pre-trained weights are used.
        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.resnet.VideoResNet`` base class.
            Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.R2Plus1D_18_Weights
        :members:
    r[   r   )r   r   r   rG   r;   re   r   r-   r-   r.   r     s    
r   )
_ModelURLs)r   r   r   )6collections.abcr   	functoolsr   typingr   r   r   r   Ztorch.nnrA   Ztorchr   Ztransforms._presetsr
   utilsr   Z_apir   r   r   Z_metar   _utilsr   r   __all__rB   r   rM   r;   rE   rY   rG   rZ   r]   re   r   r   r   r7   r   r   r   r   r   r   r   r   r   r   r   r   Z
model_urlsr-   r-   r-   r.   <module>   sf   %1_$#$#$$