
"""PyTorch VideoMAE (masked autoencoder) model."""

import collections.abc
from copy import deepcopy
from dataclasses import dataclass
from typing import Callable, Optional, Set, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .configuration_videomae import VideoMAEConfig


logger = logging.get_logger(__name__)


@dataclass
class VideoMAEDecoderOutput(ModelOutput):
    """
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class VideoMAEForPreTrainingOutput(ModelOutput):
    """
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`):
            Pixel reconstruction loss.
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


def get_sinusoid_encoding_table(n_position, d_hid):
    """Sinusoid position encoding table"""

    def get_position_angle_vec(position):
        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

    sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    return torch.FloatTensor(sinusoid_table).unsqueeze(0)


class VideoMAEEmbeddings(nn.Module):
    """
    Construct the patch and position embeddings.

    """

    def __init__(self, config):
        super().__init__()

        self.patch_embeddings = VideoMAEPatchEmbeddings(config)
        self.num_patches = self.patch_embeddings.num_patches
        # fixed sin-cos position embedding
        self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size)
        self.config = config

    def forward(self, pixel_values, bool_masked_pos):
        # create patch embeddings
        embeddings = self.patch_embeddings(pixel_values)

        # add position embeddings
        embeddings = embeddings + self.position_embeddings.detach().type_as(embeddings).to(
            device=embeddings.device, copy=True
        )

        # only keep visible patches
        # ~bool_masked_pos means visible
        if bool_masked_pos is not None:
            batch_size, _, num_channels = embeddings.shape
            embeddings = embeddings[~bool_masked_pos]
            embeddings = embeddings.reshape(batch_size, -1, num_channels)

        return embeddings


class VideoMAEPatchEmbeddings(nn.Module):
    """
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).

    """

    def __init__(self, config):
        super().__init__()

        image_size = config.image_size
        patch_size = config.patch_size
        num_channels = config.num_channels
        hidden_size = config.hidden_size
        num_frames = config.num_frames
        tubelet_size = config.tubelet_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.image_size = image_size
        self.patch_size = patch_size
        self.tubelet_size = int(tubelet_size)
        num_patches = (
            (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
        )
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.projection = nn.Conv3d(
            in_channels=num_channels,
            out_channels=hidden_size,
            kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
            stride=(self.tubelet_size, patch_size[0], patch_size[1]),
        )

    def forward(self, pixel_values):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        # permute to (batch_size, num_channels, num_frames, height, width)
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class VideoMAESelfAttention(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)

        if config.qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(self.all_head_size))
            self.v_bias = nn.Parameter(torch.zeros(self.all_head_size))
        else:
            self.q_bias = None
            self.v_bias = None

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None
        keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias)
        values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias)
        queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias)

        key_layer = self.transpose_for_scores(keys)
        value_layer = self.transpose_for_scores(values)
        query_layer = self.transpose_for_scores(queries)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class VideoMAESelfOutput(nn.Module):
    """
    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class VideoMAEAttention(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.attention = VideoMAESelfAttention(config)
        self.output = VideoMAESelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class VideoMAEIntermediate(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class VideoMAEOutput(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class VideoMAELayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = VideoMAEAttention(config)
        self.intermediate = VideoMAEIntermediate(config)
        self.output = VideoMAEOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in VideoMAE, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in VideoMAE, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class VideoMAEEncoder(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([VideoMAELayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class VideoMAEPreTrainedModel(PreTrainedModel):
    config_class = VideoMAEConfig
    base_model_prefix = "videomae"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class VideoMAEModel(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = VideoMAEEmbeddings(config)
        self.encoder = VideoMAEEncoder(config)

        if config.use_mean_pooling:
            self.layernorm = None
        else:
            self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
            length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.

        Examples:

        ```python
        >>> import av
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEModel
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

        >>> # prepare video for the model
        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1568, 768]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        if self.layernorm is not None:
            sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class VideoMAEDecoder(nn.Module):
    def __init__(self, config, num_patches):
        super().__init__()

        decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2

        decoder_config = deepcopy(config)
        decoder_config.hidden_size = config.decoder_hidden_size
        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        self.decoder_layers = nn.ModuleList(
            [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
        )

        self.norm = nn.LayerNorm(config.decoder_hidden_size)
        self.head = (
            nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity()
        )

        self.gradient_checkpointing = False
        self.config = config

    def forward(
        self,
        hidden_states,
        return_token_num,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        # apply Transformer layers (blocks)
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.decoder_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    None,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, head_mask=None, output_attentions=output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_token_num > 0:
            hidden_states = hidden_states[:, -return_token_num:]

        # predictor projection
        logits = self.head(self.norm(hidden_states))

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
        return VideoMAEDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)


@auto_docstring(
    custom_intro="""
    The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.
    """
)
class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.videomae = VideoMAEModel(config)

        self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=False)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
        self.position_embeddings = get_sinusoid_encoding_table(
            self.videomae.embeddings.num_patches, config.decoder_hidden_size
        )

        self.decoder = VideoMAEDecoder(config, num_patches=self.videomae.embeddings.num_patches)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, VideoMAEForPreTrainingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
            (image_size // patch_size) ** 2`.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 16
        >>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")

        >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values

        >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
        >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
        >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss = outputs.loss
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.videomae(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        # [batch_size, num_visible_patches, decoder_hidden_size]
        sequence_output = self.encoder_to_decoder(sequence_output)
        batch_size, seq_len, num_channels = sequence_output.shape

        # we don't unshuffle the correct visible token order, but shuffle the position embeddings accordingly
        if bool_masked_pos is None:
            raise ValueError("One must provide a boolean mask.")
        expanded_position_embeddings = self.position_embeddings.expand(batch_size, -1, -1).type_as(pixel_values)
        expanded_position_embeddings = expanded_position_embeddings.to(pixel_values.device).clone().detach()
        pos_emb_visible = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels)
        pos_emb_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels)

        # [batch_size, num_patches, decoder_hidden_size]
        x_full = torch.cat([sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1)

        # [batch_size, num_masked_patches, num_channels * patch_size * patch_size]
        decoder_outputs = self.decoder(x_full, pos_emb_mask.shape[1])
        logits = decoder_outputs.logits

        loss = None
        with torch.no_grad():
            # calculate the labels to be predicted
            if self.config.num_channels != 3:
                # can't unnormalize with default means/stds
                frames = pixel_values
            else:
                # first, unnormalize the frames
                device = pixel_values.device
                dtype = pixel_values.dtype
                mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device=device, dtype=dtype)[None, None, :, None, None]
                std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device=device, dtype=dtype)[None, None, :, None, None]
                frames = pixel_values * std + mean  # in [0, 1]

            batch_size, time, num_channels, height, width = frames.shape
            tubelet_size, patch_size = self.config.tubelet_size, self.config.patch_size
            if self.config.norm_pix_loss:
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate:
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate:
                frames = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size,
                    num_channels,
                )
                # step 4: normalize each patch separately over its pixels (mean/std along dim=-2)
                frames_norm = (frames - frames.mean(dim=-2, keepdim=True)) / (
                    frames.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6
                )
                # step 5: reshape to (batch_size, T//ts * H//ps * W//ps, ts * ps * ps * C)
                videos_patch = frames_norm.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )
            else:
                if self.config.num_channels != 3:
                    raise ValueError(
                        "Can't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False."
                    )
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate: (batch_size, T//ts, H//ps, W//ps, ts, ps, ps, C)
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate
                videos_patch = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )

            batch_size, _, num_channels = videos_patch.shape
            labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels)

        loss_fct = MSELoss()
        loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return VideoMAEForPreTrainingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.
    """
)
class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.videomae = VideoMAEModel(config)

        # Classifier head
        self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import av
        >>> import torch
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
        >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        eating spaghetti
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.videomae(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        if self.fc_norm is not None:
            sequence_output = self.fc_norm(sequence_output.mean(1))
        else:
            sequence_output = sequence_output[:, 0]

        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VideoMAEForPreTraining", "VideoMAEModel", "VideoMAEPreTrainedModel", "VideoMAEForVideoClassification"]