
"""PyTorch ViViT model."""

from typing import Callable, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging, torch_int
from .configuration_vivit import VivitConfig


logger = logging.get_logger(__name__)


class VivitTubeletEmbeddings(nn.Module):
    """
    Construct Vivit Tubelet embeddings.

    This module turns a batch of videos of shape (batch_size, num_frames, num_channels, height, width) into a tensor of
    shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size[0]) * (height // tubelet_size[1]) *
    (width // tubelet_size[2]).
    """

    def __init__(self, config):
        super().__init__()
        self.num_frames = config.num_frames
        self.image_size = config.image_size
        self.patch_size = config.tubelet_size
        self.num_patches = (
            (self.image_size // self.patch_size[2])
            * (self.image_size // self.patch_size[1])
            * (self.num_frames // self.patch_size[0])
        )
        self.embed_dim = config.hidden_size

        self.projection = nn.Conv3d(
            config.num_channels, config.hidden_size, kernel_size=config.tubelet_size, stride=config.tubelet_size
        )

    def forward(self, pixel_values, interpolate_pos_encoding: bool = False):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )

        # permute to (batch_size, num_channels, num_frames, height, width)
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)

        x = self.projection(pixel_values)
        # flatten the tubelet grid into a sequence: (batch_size, seq_len, hidden_size)
        x = x.flatten(2).transpose(1, 2)
        return x


class VivitEmbeddings(nn.Module):
    """
    Vivit Embeddings.

    Creates embeddings from a video using VivitTubeletEmbeddings, adds CLS token and positional embeddings.
    """

    def __init__(self, config):
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = VivitTubeletEmbeddings(config)

        self.position_embeddings = nn.Parameter(
            torch.zeros(1, self.patch_embeddings.num_patches + 1, config.hidden_size)
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.tubelet_size[1:]
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values, interpolate_pos_encoding: bool = False):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        cls_tokens = self.cls_token.tile([batch_size, 1, 1])
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    # This is actually dropping out entire tokens to attend to.
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class VivitSelfAttention(nn.Module):
    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class VivitSelfOutput(nn.Module):
    """
    The residual connection is defined in VivitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class VivitAttention(nn.Module):
    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.attention = VivitSelfAttention(config)
        self.output = VivitSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class VivitIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class VivitOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class VivitLayer(nn.Module):
    """This corresponds to the EncoderBlock class in the scenic/vivit implementation."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = VivitAttention(config)
        self.intermediate = VivitIntermediate(config)
        self.output = VivitOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, head_mask=None, output_attentions=False):
        self_attention_outputs = self.attention(
            # in Vivit, layernorm is applied before self-attention
            self.layernorm_before(hidden_states),
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        # add self attentions if we output attention weights
        outputs = self_attention_outputs[1:]

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in Vivit, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class VivitEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([VivitLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class VivitPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # "pool" the model by simply taking the hidden state corresponding to the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class VivitPreTrainedModel(PreTrainedModel):
    config_class = VivitConfig
    base_model_prefix = "vivit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []
    _supports_sdpa = True
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, VivitEmbeddings):
            module.cls_token.data.zero_()
            module.position_embeddings.data.zero_()


@auto_docstring
class VivitModel(VivitPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = VivitEmbeddings(config)
        self.encoder = VivitEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = VivitPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model.

        Args:
            heads_to_prune:
                dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutputWithPooling]:
        r"""
Examples:

```python
>>> import av
>>> import numpy as np

>>> from transformers import VivitImageProcessor, VivitModel
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`List[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`List[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> # prepare video for the model
>>> inputs = image_processor(list(video), return_tensors="pt")
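>>> # Added shape check (assuming the processor's default 224x224 crop and the 32 sampled frames above):
>>> # `pixel_values` follows the (batch_size, num_frames, num_channels, height, width) layout expected by the model.
>>> inputs["pixel_values"].shape
torch.Size([1, 32, 3, 224, 224])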

>>> # forward pass
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 3137, 768]
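>>> # Added sanity check (assuming the default ViViT-B configuration of this checkpoint:
>>> # 32 frames, 224x224 frames, tubelet_size=[2, 16, 16]); the extra token is the [CLS] token.
>>> (32 // 2) * (224 // 16) * (224 // 16) + 1
3137
>>> # Larger frames can be processed by passing `interpolate_pos_encoding=True` to the model,
>>> # which resizes the pre-trained position embeddings accordingly.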
```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    ViViT Transformer model with a video classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for Kinetics-400.

    <Tip>

    Note that it's possible to fine-tune ViViT on higher resolution images than the ones it has been trained on, by
    setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
    position embeddings to the higher resolution.

    </Tip>
    """
)
class VivitForVideoClassification(VivitPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vivit = VivitModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], ImageClassifierOutput]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import av
>>> import numpy as np
>>> import torch

>>> from transformers import VivitImageProcessor, VivitForVideoClassification
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`List[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`List[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> inputs = image_processor(list(video), return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)
...     logits = outputs.logits

>>> # model predicts one of the 400 Kinetics-400 classes
>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])
LABEL_116
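>>> # Added shape check: the raw logits cover the 400 Kinetics-400 classes mentioned above, and can be
>>> # turned into probabilities with a softmax if needed (assuming the same single-clip batch as above).
>>> logits.softmax(-1).shape
torch.Size([1, 400])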
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vivit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # classify on the final hidden state of the [CLS] token
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # single label: regression loss
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VivitModel", "VivitPreTrainedModel", "VivitForVideoClassification"]