a
    «º”hî€  ã                	   @   s¾  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlmZ ddlmZmZ ddlmZ dd	lmZ d
dlmZmZmZ d
dlmZ d
dlmZmZ g d¢ZeG dd„ dƒƒZ ee! e!dœdd„Z"ej#e!e!e$ej#e!f dœdd„Z%ej#e!e!e!ej#dœdd„Z&ej' (d¡ ej' (d¡ G dd„ dej)ƒZ*ej#e!ej#dœdd„Z+ej#ej#e$e!e!e!f e$e!e!e!f ej#ej#ej#ej#dœd d!„Z,ej#ej#e-d"œd#d$„Z.ej' (d!¡ ej' (d$¡ G d%d&„ d&ej)ƒZ/G d'd(„ d(ej)ƒZ0G d)d*„ d*ej)ƒZ1G d+d,„ d,ej)ƒZ2e3e  e4e
e e-ee2d-œd.d/„Z5G d0d1„ d1eƒZ6G d2d3„ d3eƒZ7eƒ ed4e6j8fd5dd6d7œe
e6 e-ee2d8œd9d:„ƒƒZ9eƒ ed4e7j8fd5dd6d7œe
e7 e-ee2d8œd;d<„ƒƒZ:dS )=é    N)ÚSequence)Ú	dataclass)Úpartial)ÚAnyÚCallableÚOptionalé   )ÚMLPÚStochasticDepth)ÚVideoClassification)Ú_log_api_usage_onceé   )Úregister_modelÚWeightsÚWeightsEnum)Ú_KINETICS400_CATEGORIES)Ú_ovewrite_named_paramÚhandle_legacy_interface)ÚMViTÚMViT_V1_B_WeightsÚ	mvit_v1_bÚMViT_V2_S_WeightsÚ	mvit_v2_sc                   @   sV   e Zd ZU eed< eed< eed< ee ed< ee ed< ee ed< ee ed< dS )	ÚMSBlockConfigÚ	num_headsÚinput_channelsÚoutput_channelsÚkernel_qÚ	kernel_kvÚstride_qÚ	stride_kvN)Ú__name__Ú
__module__Ú__qualname__ÚintÚ__annotations__Úlist© r'   r'   úK/var/www/auris/lib/python3.9/site-packages/torchvision/models/video/mvit.pyr      s   
r   )ÚsÚreturnc                 C   s   d}| D ]}||9 }q|S ©Né   r'   )r)   ÚproductÚvr'   r'   r(   Ú_prod'   s    
r/   )ÚxÚ
target_dimÚ
expand_dimr*   c                 C   s@   |   ¡ }||d kr |  |¡} n||kr8td| j› ƒ‚| |fS )Nr,   zUnsupported input dimension )ÚdimÚ	unsqueezeÚ
ValueErrorÚshape©r0   r1   r2   Ú
tensor_dimr'   r'   r(   Ú
_unsqueeze.   s    r9   )r0   r1   r2   r8   r*   c                 C   s   ||d kr|   |¡} | S r+   )Úsqueezer7   r'   r'   r(   Ú_squeeze7   s    
r;   c                       sl   e Zd Zd	ejeej eej eddœ‡ fdd„Zej	e
eeef e
ej	e
eeef f dœdd„Z‡  ZS )
ÚPoolNF)ÚpoolÚnormÚ
activationÚnorm_before_poolr*   c                    sV   t ƒ  ¡  || _g }|d ur&| |¡ |d ur8| |¡ |rFtj|Ž nd | _|| _d S )N)ÚsuperÚ__init__r=   ÚappendÚnnÚ
SequentialÚnorm_actr@   )Úselfr=   r>   r?   r@   Úlayers©Ú	__class__r'   r(   rB   B   s    


zPool.__init__©r0   Úthwr*   c                 C   sö   t |ddƒ\}}tj|ddd\}}| dd¡}|jd d… \}}}| || |f| ¡ ¡ }| jrx| jd urx|  |¡}|  	|¡}|jdd … \}}	}
| |||d¡ dd¡}tj
||fdd}| jsÚ| jd urÚ|  |¡}t|dd|ƒ}|||	|
ffS )	Né   r,   ©r,   r   )Úindicesr3   r   éÿÿÿÿ©r3   )r9   ÚtorchZtensor_splitÚ	transposer6   ÚreshapeÚ
contiguousr@   rF   r=   Úcatr;   )rG   r0   rL   r8   Úclass_tokenÚBÚNÚCÚTÚHÚWr'   r'   r(   ÚforwardS   s    


zPool.forward)NF)r!   r"   r#   rD   ÚModuler   ÚboolrB   rR   ÚTensorÚtupler$   r^   Ú__classcell__r'   r'   rI   r(   r<   A   s     ûúr<   )Ú	embeddingÚdr*   c                 C   s@   | j d |kr| S tjj|  dd¡ d¡|dd d¡ dd¡S )Nr   r,   Zlinear)ÚsizeÚmode)r6   rD   Z
functionalZinterpolateÚpermuter4   r:   )rd   re   r'   r'   r(   Ú_interpolatem   s    ýûúÿri   )ÚattnÚqÚq_thwÚk_thwÚ	rel_pos_hÚ	rel_pos_wÚ	rel_pos_tr*   c           %      C   s6  |\}}}	|\}
}}t dt||ƒ d ƒ}t dt|	|ƒ d ƒ}t dt||
ƒ d ƒ}t|| dƒ}t|| dƒ}t |¡d d …d f | t |¡d d d …f d|  |  }t||	 dƒ}t|	| dƒ}t |	¡d d …d f | t |¡d d d …f d|  |  }t|
| dƒ}t||
 dƒ}t |¡d d …d f | t |
¡d d d …f d|
  |  }t||ƒ}t||ƒ}t||ƒ}|| ¡  }|| ¡  }|| ¡  }|j\}}}}|d d …d d …dd …f  |||||	|¡} t d| |¡}!t d| |¡}"|  	dddddd	¡ ||| | |	 |¡} t 
| | dd¡¡ dd¡}#|# ||||	||
¡ 	dddddd	¡}#|!d d …d d …d d …d d …d d …d d d …d f |"d d …d d …d d …d d …d d …d d d d …f  |#d d …d d …d d …d d …d d …d d …d d f   |||| |	 |
| | ¡}$| d d …d d …dd …dd …f  |$7  < | S )
Nr   r,   ç      ð?zbythwc,hkc->bythwkzbythwc,wkc->bythwkr   r   rM   é   )r$   ÚmaxrR   Zarangeri   Úlongr6   rT   Zeinsumrh   ÚmatmulrS   Úview)%rj   rk   rl   rm   rn   ro   rp   Zq_tZq_hZq_wZk_tZk_hZk_wZdhZdwÚdtZ	q_h_ratioZ	k_h_ratioZdist_hZ	q_w_ratioZ	k_w_ratioZdist_wZ	q_t_ratioZ	k_t_ratioZdist_tZRhZRwZRtrX   Zn_headÚ_r3   Zr_qZrel_h_qZrel_w_qZrel_q_tZrel_posr'   r'   r(   Ú_add_rel_pos|   sJ    


<<<


**$..ÿ.þÿü(ry   ©r0   ZshortcutÚresidual_with_cls_embedc              	   C   sX   |r|   |¡ nD| d d …d d …dd …d d …f  |d d …d d …dd …d d …f 7  < | S r+   )Úadd_rz   r'   r'   r(   Ú_add_shortcut¸   s    Dr}   c                       s”   e Zd Zdejfee eeeee ee ee ee eeeee	dej
f ddœ‡ fdd„Zejeeeef eejeeeef f dœdd	„Z‡  ZS )
ÚMultiscaleAttentionç        .N)Ú
input_sizeÚ	embed_dimÚ
output_dimr   r   r   r   r    Úresidual_poolr{   Úrel_pos_embedÚdropoutÚ
norm_layerr*   c              
      st  t ƒ  ¡  || _|| _|| _|| | _dt | j¡ | _|	| _	|
| _
t |d| ¡| _t ||¡g}|dkr€| tj|dd¡ tj|Ž | _d | _t|ƒdksªt|ƒdkrädd„ |D ƒ}ttj| j| j|||| jd	d
|| jƒƒ| _d | _d | _t|ƒdkst|ƒdkrrdd„ |D ƒ}ttj| j| j|||| jd	d
|| jƒƒ| _ttj| j| j|||| jd	d
|| jƒƒ| _d | _d | _d | _|rpt|dd … ƒ}t|ƒdkr´||d  n|}t|ƒdkrÒ||d  n|}dt||ƒ d }d|d  d }t t || j¡¡| _t t || j¡¡| _t t || j¡¡| _tj j!| jdd tj j!| jdd tj j!| jdd d S )Nrq   r   r   T©Úinplacer,   c                 S   s   g | ]}t |d  ƒ‘qS ©r   ©r$   )Ú.0rk   r'   r'   r(   Ú
<listcomp>æ   ó    z0MultiscaleAttention.__init__.<locals>.<listcomp>F)ÚstrideÚpaddingÚgroupsÚbiasc                 S   s   g | ]}t |d  ƒ‘qS r‰   rŠ   )r‹   Úkvr'   r'   r(   rŒ   ÷   r   r   r   ç{®Gáz”?©Ústd)"rA   rB   r   r‚   r   Úhead_dimÚmathÚsqrtÚscalerrƒ   r{   rD   ÚLinearÚqkvrC   ÚDropoutrE   ÚprojectÚpool_qr/   r<   ÚConv3dÚpool_kÚpool_vrn   ro   rp   rs   ÚlenÚ	ParameterrR   ÚzerosÚinitÚtrunc_normal_)rG   r€   r   r‚   r   r   r   r   r    rƒ   r{   r„   r…   r†   rH   Z	padding_qZ
padding_kvrf   Zq_sizeZkv_sizeZspatial_dimZtemporal_dimrI   r'   r(   rB   Å   sŽ    

ù	öù	öù	özMultiscaleAttention.__init__rK   c                 C   s<  |j \}}}|  |¡ ||d| j| j¡ dd¡jdd\}}}| jd urZ|  ||¡\}}	n|}	| jd urx|  ||¡d }| j	d ur’|  	||¡\}}t
 | j| | dd¡¡}
| jd urä| jd urä| jd urät|
|||	| j| j| jƒ}
|
jdd}
t
 |
|¡}| jrt||| jƒ | dd¡ |d| j¡}|  |¡}||fS )Nr   r,   r   rQ   r   rP   )r6   r›   rT   r   r–   rS   Zunbindr    r¡   rž   rR   ru   r™   rn   ro   rp   ry   Zsoftmaxrƒ   r}   r{   r‚   r   )rG   r0   rL   rX   rY   rZ   rk   Úkr.   rm   rj   r'   r'   r(   r^   !  s6    2


ù	
zMultiscaleAttention.forward)r!   r"   r#   rD   Ú	LayerNormr&   r$   r`   Úfloatr   r_   rB   rR   ra   rb   r^   rc   r'   r'   rI   r(   r~   Ä   s&   òñ\r~   c                       s~   e Zd Zddejfee eeeeee	e	e
dejf ddœ
‡ fdd„Zejeeeef eejeeeef f dœdd	„Z‡  ZS )
ÚMultiscaleBlockr   .N)
r€   Úcnfrƒ   r{   r„   Úproj_after_attnr…   Ústochastic_depth_probr†   r*   c
                    s  t ƒ  ¡  || _d | _t|jƒdkr\dd„ |jD ƒ}
dd„ |
D ƒ}ttj|
|j|dd ƒ| _|rf|j	n|j
}|	|j
ƒ| _|	|ƒ| _t| jtjƒ| _t||j
||j|j|j|j|j|||||	d| _t|d| |j	gtj|d d| _t|d	ƒ| _d | _|j
|j	krt |j
|j	¡| _d S )
Nr,   c                 S   s    g | ]}|d kr|d  n|‘qS rN   r'   )r‹   r)   r'   r'   r(   rŒ   V  r   z,MultiscaleBlock.__init__.<locals>.<listcomp>c                 S   s   g | ]}t |d  ƒ‘qS r‰   rŠ   )r‹   r§   r'   r'   r(   rŒ   W  r   )rŽ   r   )	r   r   r   r    r„   rƒ   r{   r…   r†   rM   )Zactivation_layerr…   rˆ   Úrow)rA   rB   r¬   Ú	pool_skipr/   r   r<   rD   Z	MaxPool3dr   r   Únorm1Únorm2Ú
isinstanceZBatchNorm1dÚneeds_transposalr~   r   r   r   r    rj   r	   ZGELUÚmlpr
   Ústochastic_depthr   rš   )rG   r€   r«   rƒ   r{   r„   r¬   r…   r­   r†   Zkernel_skipZpadding_skipZattn_dimrI   r'   r(   rB   E  sN    
ÿ
óûzMultiscaleBlock.__init__rK   c           	      C   sä   | j r |  | dd¡¡ dd¡n|  |¡}|  ||¡\}}| jd u sJ| jsN|n|  |¡}| jd u rf|n|  ||¡d }||  |¡ }| j r¤|  | dd¡¡ dd¡n|  |¡}| jd u s¾| jrÂ|n|  |¡}||  |  	|¡¡ |fS )Nr,   r   r   )
r³   r°   rS   rj   r   r¬   r¯   rµ   r±   r´   )	rG   r0   rL   Zx_norm1Zx_attnZthw_newZx_skipZx_norm2Zx_projr'   r'   r(   r^     s    **zMultiscaleBlock.forward)r!   r"   r#   rD   r¨   r&   r$   r   r`   r©   r   r_   rB   rR   ra   rb   r^   rc   r'   r'   rI   r(   rª   D  s    	öõ:rª   c                       sF   e Zd Zeeeef eeddœ‡ fdd„Zejejdœdd„Z	‡  Z
S )ÚPositionalEncodingN)Ú
embed_sizeÚspatial_sizeÚtemporal_sizer„   r*   c                    sŽ   t ƒ  ¡  || _|| _t t |¡¡| _d | _	d | _
d | _|sŠt t | jd | jd  |¡¡| _	t t | j|¡¡| _
t t |¡¡| _d S )Nr   r,   )rA   rB   r¸   r¹   rD   r£   rR   r¤   rW   Úspatial_posÚtemporal_posÚ	class_pos)rG   r·   r¸   r¹   r„   rI   r'   r(   rB     s    
$zPositionalEncoding.__init__©r0   r*   c                 C   s¼   | j  | d¡d¡ d¡}tj||fdd}| jd ur¸| jd ur¸| jd ur¸| jj	\}}tj
| j|dd}| | j d¡ | jdd¡ d|¡¡ tj| j d¡|fdd d¡}| |¡ |S )Nr   rP   r,   rQ   )rW   Úexpandrf   r4   rR   rV   rº   r»   r¼   r6   Zrepeat_interleaver|   r¹   rT   )rG   r0   rW   Zhw_sizer·   Zpos_embeddingr'   r'   r(   r^   ›  s    & 
zPositionalEncoding.forward)r!   r"   r#   r$   rb   r`   rB   rR   ra   r^   rc   r'   r'   rI   r(   r¶   Œ  s   "r¶   c                       sž   e Zd Zdeeef eee eeeeeeeee	e
dejf  e	e
dejf  eeeef eeeef eeeef dd	œ‡ fd
d„Zejejdœdd„Z‡  ZS )r   ç      à?r   é  N©r   é   rÂ   ©r   rM   rM   ©r,   r   r   .)r¸   r¹   Úblock_settingrƒ   r{   r„   r¬   r…   Úattention_dropoutr­   Únum_classesÚblockr†   Úpatch_embed_kernelÚpatch_embed_strideÚpatch_embed_paddingr*   c                    s*  t ƒ  ¡  t| ƒ t|ƒ}|dkr*tdƒ‚|du r6t}|du rLttjdd}tj	d|d j
|||d| _dd	„ t|f| | jjƒD ƒ}t|d j
|d
 |d f|d |d| _t ¡ | _t|ƒD ]^\}}|
| |d  }| j ||||||||	||d	¡ t|jƒdkrÀdd	„ t||jƒD ƒ}qÀ||d jƒ| _t tj|ddt |d j|¡¡| _|  ¡ D ]Æ}t|tjƒr®tjj|jdd t|tjƒr"|j dur"tj !|j d¡ ntt|tjƒrö|jdurØtj !|jd¡ |j dur"tj !|j d¡ n,t|tƒr^| "¡ D ]}tjj|dd q
q^dS )aÄ  
        MViT main class.

        Args:
            spatial_size (tuple of ints): The spacial size of the input as ``(H, W)``.
            temporal_size (int): The temporal size ``T`` of the input.
            block_setting (sequence of MSBlockConfig): The Network structure.
            residual_pool (bool): If True, use MViTv2 pooling residual connection.
            residual_with_cls_embed (bool): If True, the addition on the residual connection will include
                the class embedding.
            rel_pos_embed (bool): If True, use MViTv2's relative positional embeddings.
            proj_after_attn (bool): If True, apply the projection after the attention.
            dropout (float): Dropout rate. Default: 0.0.
            attention_dropout (float): Attention dropout rate. Default: 0.0.
            stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
            num_classes (int): The number of classes.
            block (callable, optional): Module specifying the layer which consists of the attention and mlp.
            norm_layer (callable, optional): Module specifying the normalization layer to use.
            patch_embed_kernel (tuple of ints): The kernel of the convolution that patchifies the input.
            patch_embed_stride (tuple of ints): The stride of the convolution that patchifies the input.
            patch_embed_padding (tuple of ints): The padding of the convolution that patchifies the input.
        r   z+The configuration parameter can't be empty.Ngíµ ÷Æ°>)Zepsr   )Zin_channelsZout_channelsZkernel_sizerŽ   r   c                 S   s   g | ]\}}|| ‘qS r'   r'   ©r‹   rf   rŽ   r'   r'   r(   rŒ   ì  r   z!MViT.__init__.<locals>.<listcomp>r,   r   )r·   r¸   r¹   r„   rq   )	r€   r«   rƒ   r{   r„   r¬   r…   r­   r†   c                 S   s   g | ]\}}|| ‘qS r'   r'   rÌ   r'   r'   r(   rŒ     r   rP   Tr‡   r“   r”   r   )#rA   rB   r   r¢   r5   rª   r   rD   r¨   rŸ   r   Ú	conv_projÚziprŽ   r¶   Úpos_encodingZ
ModuleListÚblocksÚ	enumeraterC   r   r   r>   rE   rœ   rš   ÚheadÚmodulesr²   r¥   r¦   Zweightr‘   Z	constant_Ú
parameters)rG   r¸   r¹   rÅ   rƒ   r{   r„   r¬   r…   rÆ   r­   rÇ   rÈ   r†   rÉ   rÊ   rË   Ztotal_stage_blocksr€   Zstage_block_idr«   Zsd_probÚmÚweightsrI   r'   r(   rB   ª  sv    )
ûü
÷ÿþzMViT.__init__r½   c                 C   sŠ   t |ddƒd }|  |¡}| d¡ dd¡}|  |¡}| jjf| jj }| jD ]}|||ƒ\}}qN|  |¡}|d d …df }|  	|¡}|S )Nrr   r   r   r,   )
r9   rÍ   ÚflattenrS   rÏ   r¹   r¸   rÐ   r>   rÒ   )rG   r0   rL   rÈ   r'   r'   r(   r^   "  s    




zMViT.forward)	r¿   r   r   rÀ   NNrÁ   rÃ   rÄ   )r!   r"   r#   rb   r$   r   r   r`   r©   r   r   rD   r_   rB   rR   ra   r^   rc   r'   r'   rI   r(   r   ©  s:   
         ï
îxr   )rÅ   r­   rÖ   ÚprogressÚkwargsr*   c                 K   sÚ   |d urbt |dt|jd ƒƒ |jd d |jd d ks>J ‚t |d|jd ƒ t |d|jd ƒ | dd	¡}| dd
¡}tf ||| | dd¡| dd¡| dd¡| dd¡|dœ|¤Ž}|d urÖ| |j|dd¡ |S )NrÇ   Ú
categoriesÚmin_sizer   r,   r¸   r¹   Úmin_temporal_size©éà   rÞ   é   rƒ   Fr{   Tr„   r¬   )r¸   r¹   rÅ   rƒ   r{   r„   r¬   r­   )rØ   Z
check_hash)r   r¢   ÚmetaÚpopr   Zload_state_dictZget_state_dict)rÅ   r­   rÖ   rØ   rÙ   r¸   r¹   Úmodelr'   r'   r(   Ú_mvit9  s,     



ø	÷rã   c                   @   sJ   e Zd Zedeedddddddedd	d
ddddœidddœ	dZeZdS )r   z:https://download.pytorch.org/models/mvit_v1_b-dbeb1030.pthrÝ   ©é   ©çÍÌÌÌÌÌÜ?rç   rç   ©çÍÌÌÌÌÌÌ?ré   ré   ©Z	crop_sizeZresize_sizeÚmeanr•   rß   zShttps://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.mdúœThe weights were ported from the paper. The accuracies are estimated on video-level with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`ip¢.úKinetics-400gJ+‡žS@gh‘í|?eW@©zacc@1zacc@5gu“V¦Q@gœÄ °rxa@©	rÛ   rÜ   rÚ   ZrecipeZ_docsZ
num_paramsZ_metricsZ_opsÚ
_file_size©ÚurlZ
transformsrà   N©	r!   r"   r#   r   r   r   r   ÚKINETICS400_V1ÚDEFAULTr'   r'   r'   r(   r   Z  s2   ûþÿï÷r   c                   @   sJ   e Zd Zedeedddddddedd	d
ddddœidddœ	dZeZdS )r   z:https://download.pytorch.org/models/mvit_v2_s-ae3be167.pthrÝ   rä   ræ   rè   rê   rß   zChttps://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.mdrì   irí   gœÄ °r0T@gÃõ(\ªW@rî   gu“VP@g?5^ºI|`@rï   rñ   Nró   r'   r'   r'   r(   r   {  s2   ûþÿï÷r   Z
pretrained)rÖ   T)rÖ   rØ   )rÖ   rØ   rÙ   r*   c                 K   sÚ  t  | ¡} g d¢g d¢g d¢g g d¢g g d¢g g g g g g g g g g g d¢g gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gg g d¢g g d¢g g g g g g g g g g g d¢g gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gd	œ}g }tt|d
 ƒƒD ]Z}| t|d
 | |d | |d | |d | |d | |d | |d | d	¡ qVtf dd|dd| dd¡| |dœ|¤ŽS )a¿  
    Constructs a base MViTV1 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V1_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V1_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V1_B_Weights
        :members:
    ©r,   r   r   rM   rM   rM   rM   rM   rM   rM   rM   rM   rM   rM   é   r÷   ©é`   éÀ   rú   é€  rû   rû   rû   rû   rû   rû   rû   rû   rû   rû   é   rü   )rú   rú   rû   rû   rû   rû   rû   rû   rû   rû   rû   rû   rû   rü   rü   rü   ©r   r   r   ©r,   r   r   ©r,   r÷   r÷   ©r,   rM   rM   ©r,   r,   r,   ©r   r   r   r   r   r   r    r   r   r   r   r   r   r    rÝ   rß   Fr­   çš™™™™™É?)r¸   r¹   rÅ   rƒ   r{   r­   rÖ   rØ   )r   ÚverifyÚranger¢   rC   r   rã   rá   ©rÖ   rØ   rÙ   ÚconfigrÅ   Úir'   r'   r(   r   œ  s„    
.ð.ðè,






ùÿ
ø	÷r   c                 K   sF  t  | ¡} g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gd	œ}g }tt|d
 ƒƒD ]Z}| t|d
 | |d | |d | |d | |d | |d | |d | d	¡ q¾tf dd|dddd| dd¡| |dœ
|¤ŽS )aC  Constructs a small MViTV2 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__ and
    `MViTv2: Improved Multiscale Vision Transformers for Classification
    and Detection <https://arxiv.org/abs/2112.01526>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V2_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V2_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V2_S_Weights
            :members:
    rö   )rù   rù   rú   rú   rû   rû   rû   rû   rû   rû   rû   rû   rû   rû   rû   rü   rø   rý   r  rþ   rÿ   r   r  r   r   r   r   r   r   r    rÝ   rß   TFr­   r  )
r¸   r¹   rÅ   rƒ   r{   r„   r¬   r­   rÖ   rØ   )r   r  r  r¢   rC   r   rã   rá   r  r'   r'   r(   r   þ  sÈ    
ððððÆN






ùÿ
öõr   );r—   Úcollections.abcr   Zdataclassesr   Ú	functoolsr   Útypingr   r   r   rR   Ztorch.fxZtorch.nnrD   Úopsr	   r
   Ztransforms._presetsr   Úutilsr   Z_apir   r   r   Z_metar   Ú_utilsr   r   Ú__all__r   r$   r/   ra   rb   r9   r;   ZfxÚwrapr_   r<   ri   ry   r`   r}   r~   rª   r¶   r   r&   r©   rã   r   r   rô   r   r   r'   r'   r'   r(   Ú<module>   sp   	
 	,ø< H ú!!!$`