
    fTh                        S r SSKrSSKJr  SSKJrJr  SSKrSSKrSSKJ	r	  SSK
Jr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  \R6                  " \5      r\ " S S\5      5       r " S S\	R>                  5      r  " S S\	R>                  5      r! " S S\	R>                  5      r" " S S\	R>                  5      r# " S S\	R>                  5      r$ " S S\	R>                  5      r% " S S\	R>                  5      r& " S S \	R>                  5      r' " S! S"\	R>                  5      r( " S# S$\	R>                  5      r)\ " S% S&\5      5       r* " S' S(\	R>                  5      r+ " S) S*\	R>                  5      r,\+\,S+.r-\" S,S-9 " S. S/\*5      5       r. " S0 S1\	R>                  5      r/\" S2S-9 " S3 S4\*5      5       r0/ S5Qr1g)6zPyTorch TVP Model    N)	dataclass)OptionalTuple)nn   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingModelOutput)PreTrainedModel)prune_linear_layer)auto_docstringlogging)load_backbone   )	TvpConfigc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)
TvpVideoGroundingOutput%   a^  
Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Temporal-Distance IoU loss for video grounding.
    logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
        input texts.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
        the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
Nlosslogits.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   r   __static_attributes__r       \/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/tvp/modeling_tvp.pyr   r   %   sq      )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>r$   r   c                   D   ^  \ rS rSrSrU 4S jrS rS rS rS r	Sr
U =r$ )	TvpLoss=   ab  
This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
ground-truth / prediction (supervise class and box).

Args:
    losses (`List[str]`):
        List of all the losses to be applied.
c                    > [         TU ]  5         U R                  U R                  U R                  S.U l        U H!  nX R
                  ;  d  M  [        SU S35      e   Xl        g )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr5   r   	__class__s      r%   r/   TvpLoss.__init__H   sa    ==****

 D==( 5n!=>>  r$   c                     [         R                  " XB5      [         R                  " X15      -
  n[         R                  " XB5      [         R                  " X15      -
  nSUR                  SS9U-  -
  nU$ )z&
Measure the intersection over union.
r   r   min)r    r;   maxclamp)	r6   
start_timeend_timecandidates_start_timecandidates_end_timer-   interunionr+   s	            r%   r0   TvpLoss.loss_iouU   s_     		-8599EZ;gg		-8599EZ;gg%++!+$u,,
r$   c                 P   [         R                  " [         R                  " X45      S5      n[         R                  " [         R                  " X5      S5      n[         R                  " [         R                  " Xg5      [         R                  " Xg5      -
  U5      R                  SS9nU$ )z%
Measure the distance of mid points.
g       @g?r:   )r    divaddr<   r;   r=   )	r6   r>   r?   r@   rA   r-   mid_candidatesmid_groundtruthdistance_diffs	            r%   r1   TvpLoss.loss_distance_   sy     599-B#XZ]^))EIIj$CSI		IIn6>9ccem

%C%. 	 r$   c                     [         R                  " XC5      n[         R                  " X!5      n[         R                  " [         R                  " [         R                  " Xg5      U5      5      nUR	                  SS9nU$ )z%
Measure the difference of duration.
g?r:   )r    subsquarerF   r=   )	r6   r>   r?   r@   rA   r-   duration_candidatesduration_groundtruthduration_diffs	            r%   r2   TvpLoss.loss_durationk   s`     $ii(;S$yy>UYYuyy9L/cem%no%+++4r$   c                    Uu  p4n[         R                  " X5      nUSS2S4   R                  5       USS2S4   R                  5       p0 n	U R                   H*  n
U	R	                  XR
                  U
   " XEXxU5      05        M,     U	$ )a5  
This performs the loss computation.

Args:
    logits (`torch.FloatTensor`):
        The output logits of head module.
    labels (`List[torch.FloatTensor]`):
        List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
Nr   r   )r    mulfloatr5   updater3   )r6   r   labelsr-   r>   r?   
candidatesr@   rA   losses_dictr   s              r%   forwardTvpLoss.forwardv   s     *0&hYYv0
5?15E5K5K5MzZ[]^Z^O_OeOeOg2KKD}}T*:AVmuvw  
 r$   )r3   r5   )r   r   r   r   r   r/   r0   r1   r2   rZ   r#   __classcell__r7   s   @r%   r'   r'   =   s&    
	 r$   r'   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TvpVisionModel   c           
        > [         TU ]  5         [        U5      U l        UR                  b  UR                  R
                  S   nO[        U R                  S5      (       aI  [        U R                  R                  S5      (       a$  U R                  R                  R
                  S   nOl[        U R                  S5      (       aF  [        U R                  R                  S5      (       a!  U R                  R                  R                  nO[        S5      e[        R                  " UUR                  SSSSSS	9U l        g )
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r.   r/   r   backbonebackbone_configrd   hasattrrc   re   r4   r   Conv2dgrid_encoder_conv)r6   rc   in_channelsr7   s      r%   r/   TvpVisionModel.__init__   s    %f-!!- 00==bAKT]]H--'$--:N:NP^2_2_--..;;B?KT]]H--'$--:N:NP]2^2^--..::K899!#"
r$   c                    UR                   u  p#pEnUR                  X#-  XEU5      nU R                  U5      S   S   nU R                  U5      n[        R
                  R                  USSS9n[        R
                  R                  USS9nUR                   SS  u  pnUR                  X#XU5      nUR                  SSS	S
S5      nU$ )Nfeature_mapsr      )rf   rg   T)inplacer   r      )	shapeviewrk   ro   r   
functional
max_pool2drelupermute)r6   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widths               r%   rZ   TvpVisionModel.forward   s    >J>P>P;
e#(()@,X]^ MM,7GJ%%&78}}''!A'F}}!!$!5-1ZZ_*yy)T||Aq!Q*r$   )rk   ro   r   r   r   r   r/   rZ   r#   r\   r]   s   @r%   r_   r_      s    
. r$   r_   c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\S\R                  4S jr	SS	\
4S
 jjrSS	\
4S jjrSrU =r$ )TvpVisualInputEmbedding   z3
Takes input of both image and video (multi-frame)
c                 x  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l
        [        R                  " SUR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                   5      U l        UR                  U l        UR                  U l	        g )Nr   eps)r.   r/   r   	Embeddingmax_position_embeddingsre   position_embeddings max_grid_row_position_embeddingsrow_position_embeddings max_grid_col_position_embeddingscol_position_embeddingstoken_type_embeddings	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutr6   rc   r7   s     r%   r/    TvpVisualInputEmbedding.__init__   s    #%<<0N0NPVPbPb#c ')||F4[4[]c]o]o'p$')||F4[4[]c]o]o'p$%'\\!V5G5G%H",,v'9'9v?T?TUzz&"<"<=060W0W-060W0W-r$   	embeddingr   r   returnc                    S=pEX R                   :  a  X R                   -  nX0R                  :  a  X0R                  -  nUR                  SSSS5      n[        R                  R                  UXE4SSS9nUR                  SSSS5      nU$ )z
This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
resolution images (high resolution videos).

r   r   r   rt   bicubicFscale_factormodealign_corners)r   r   r}   r   rz   interpolate)r6   r   r   r   h0w0s         r%   interpolate_pos_encoding0TvpVisualInputEmbedding.interpolate_pos_encoding   s     999???B888>>>B%%aAq1	MM--	 . 
	 %%aAq1	r$   r   c                    UR                   u  p4pV[        U R                  U5      n[        R                  " U[        R
                  UR                  S9nU R                  U5      n	S[        UR                   5      S-
  -  USU4-   n
U	R                  " U
6 n	[        U R                  U5      n[        R                  " U[        R
                  UR                  S9nU R                  U5      nUSX4nUR                  " U6 nX-   nU(       a4  X@R                  :  d  XPR                  :  a  XR                  XU5      -   nU$ X-   nU$ )a.  
Args:
    grid: (batch_size, height, width, hidden_dim)
    interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the pre-trained position encodings.
Returns:
    grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
dtypedevice)r   r   r   )rx   r;   r   r    arangelongr   r   lenry   r   r   r   )r6   r   r   r   r   r   
hidden_dim
row_heightrow_position_idsr   	row_shape	row_widthcol_position_idsr   	col_shapepositional_embeddingss                   r%   add_2d_positional_embeddings4TvpVisualInputEmbedding.add_2d_positional_embeddings   s7    15

-
E >>G
 <<
%**T[[Y"&">">?O"PC

Oa/0J:3NN	"9">">	"J ==uE	 <<	DKKX"&">">?O"PI:	"9">">	"J 7 Q $:::eFkFk>k778MW\]]D  /Dr$   c                 x   UR                   u  p4pVnUR                  S5      nU R                  XS9nUR                  USU5      nUR                   SS n	UR                  n
[
        R                  " U	[
        R                  U
S9nU R                  U5      nX-   nU R                  U5      nU R                  U5      nU$ )a  
Args:
    grid: Array of shape (batch_size, num_frames, height, width, num_channels).
        It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
        num_frames can be 1
    interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
        Whether to interpolate the pre-trained position encodings.

Returns:
    embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

r   r   rb   Nr   )rx   meanr   ry   r   r    zerosr   r   r   r   )r6   r   r   r   r   r   r   r   visual_tokensvisual_tokens_shaper   token_type_idsr   
embeddingss                 r%   rZ   TvpVisualInputEmbedding.forward  s     ?Cjj;
|yy|000i		*b,?+11#26%% %8

SYZ $ : :> J":
__Z0
\\*-
r$   )r   r   r   r   r   r   r   r   F)r   r   r   r   r   r/   r    Tensorintr   boolr   rZ   r#   r\   r]   s   @r%   r   r      sY    
X%,,  TW \a\h\h .'4 'Rd  r$   r   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )TvpTextInputEmbeddingsi&  zGConstruct the embeddings from word, position and token_type embeddings.c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                   5      U l        g )N)padding_idxr   )r.   r/   r   r   
vocab_sizere   pad_token_idword_embeddingsr   r   type_vocab_sizer   r   r   r   r   r   r   r   s     r%   r/   TvpTextInputEmbeddings.__init__)  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]",,v'9'9v?T?TUzz&"<"<=r$   c                 .   Ub  UR                  5       nOUR                  5       S S nUS   nUb  UR                  OUR                  nUcD  [        R                  " U[        R                  US9nUR                  S5      R                  U5      nUc$  [        R                  " U[        R                  US9nUc  U R                  U5      nU R                  U5      nU R                  U5      n	XH-   U	-   n
U R                  U
5      n
U R                  U
5      n
U
$ )Nrb   r   r   r   )sizer   r    r   r   	unsqueezeexpandr   r   r   r   r   r   )r6   	input_idsr   position_idsinputs_embedsinput_shape
seq_lengthr   r   r   r   s              r%   rZ   TvpTextInputEmbeddings.forward1  s    #..*K',,.s3K ^
%.%:!!@T@T <<
%**VTL'11!4;;KHL!"[[EJJvVN  00;M"66|D $ : :> J"8;PP
__Z0
\\*-
r$   )r   r   r   r   r   )NNNN	r   r   r   r   r   r/   rZ   r#   r\   r]   s   @r%   r   r   &  s    Q> r$   r   c                   v   ^  \ rS rSrU 4S jrS rS\R                  S\S\4S jr	   SS\
\   4S	 jjrS
rU =r$ )TvpAttentioniJ  c                   > [         TU ]  5         UR                  UR                  -  S:w  a6  [	        US5      (       d%  [        SUR                   SUR                   35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R$                  " UR                  UR&                  S9U l        [        R                  " UR*                  5      U l        [/        5       U l        g )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r   )r.   r/   re   num_attention_headsrm   r4   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattn_dropoutdenser   r   r   r   r   setpruned_headsr   s     r%   r/   TvpAttention.__init__K  s    : ::a?PVXhHiHi"6#5#5"66jkq  lF  lF  kG  H  $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
JJv'J'JKYYv1163E3EF
,,v'9'9v?T?TUzz&"<"<=Er$   c                 T  ^ [        U5      S:X  a  g [        R                  " U R                  U R                  5      n[        U5      U R                  -
  nU H*  mT[        U4S jU R                   5       5      -
  mSUT'   M,     UR                  S5      R                  5       R                  S5      n[        R                  " [        U5      5      U   R                  5       n[        U R                  U5      U l        [        U R                  U5      U l        [        U R                   U5      U l        [        U R"                  USS9U l        U R                  [        U5      -
  U l        U R                  U R                  -  U l        U R                  R'                  U5      U l        g )Nr   c              3   6   >#    U  H  oT:  a  S OSv   M     g7f)r   r   Nr   ).0hheads     r%   	<genexpr>+TvpAttention.prune_heads.<locals>.<genexpr>g  s     N<Mqt8a2<Ms   rb   r   dim)r   r    onesr   r   r   r   sumry   
contiguouseqr   r   r   r   r   r   r   r   rC   )r6   headsmaskindexr   s       @r%   prune_headsTvpAttention.prune_heads`  sI   u:?zz$22D4L4LME
T...D#ND<M<MNNNDDJ  yy}''),,Q/SY'-224 (

E:
%dhh6'

E:
'

EqA
 $(#;#;c%j#H !558P8PP --33E:r$   tensorsequence_lengthr   c                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   rt   )ry   r   r   	transposer   )r6   r  r  r   s       r%   _reshapeTvpAttention._reshapew  s5    KK
T5M5MtOgOghYq!_Z\	
r$   output_attentionsc                 2   UR                   S S u  pVU R                  U5      nU R                  U5      nU R                  U5      n	U R	                  XvU5      n
U R	                  XU5      nU R	                  XU5      n[
        R                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUb  X-   n[        R                  R                  USS9nU R                  U5      nUb  X-  n[
        R                  " X5      nUR                  SS5      R                  5       nUR!                  XVU R"                  5      nU R%                  U5      nU R'                  U5      nU R)                  X-   5      nU(       a  X4nU$ U4nU$ )Nrt   rb   r   r   )rx   r   r   r   r  r    matmulr  mathsqrtr   r   rz   softmaxr   r   reshaper   r   r   r   )r6   r   attention_mask	head_maskr	  r   r  mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probsattn_outputoutputss                    r%   rZ   TvpAttention.forward~  s    '4&9&9"1&=#
 JJ}5((=1 JJ}5mm$5
SMM/JO	mm$5
S !<<5H5HR5PQ+dii8P8P.QQ%/@ --//0@b/I ++O<  -9Oll?@!++Aq1<<>!))*tGYGYZjj-ll;/ook&AB4E;0 MX>r$   )r   r   r   r   r   r   r   r   r   r   r   NNN)r   r   r   r   r/   r  r    r   r   r  r   r   rZ   r#   r\   r]   s   @r%   r   r   J  sN    "*;.
u|| 
c 
s 
 ,0+
 $D>+ +r$   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )TvpIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g N)r.   r/   r   r   re   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r%   r/   TvpIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r$   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r"  r   r'  )r6   r   s     r%   rZ   TvpIntermediate.forward  s&    

=100?r$   r*  
r   r   r   r   r/   r    r   rZ   r#   r\   r]   s   @r%   r   r     s(    9U\\ ell  r$   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )TvpOutputLayeri  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr   )r.   r/   r   r   r#  re   r   r   r   r   r   r   r   r   s     r%   r/   TvpOutputLayer.__init__  s`    YYv779K9KL
,,v'9'9v?T?TUzz&"<"<=r$   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r"  r   r   r   )r6   r   r1  s      r%   rZ   TvpOutputLayer.forward  s5    

=1]3(DEr$   r3  r,  r]   s   @r%   r.  r.    s6    >U\\  RWR^R^  r$   r.  c                   F   ^  \ rS rSrU 4S jr   SS\\   4S jjrSrU =r	$ )TvpEncodeLayeri  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        U5      U l        g r"  )r.   r/   r   	attentionr   intermediater.  outputr   s     r%   r/   TvpEncodeLayer.__init__  s3    %f-+F3$V,r$   r	  c                     U R                  UUUUS9nUS   nUSS  nU R                  U5      nU R                  X5      n	U	4U-   nU$ )N)r	  r   r   r8  r9  r:  )
r6   r   r  r  r	  self_attention_outputsattention_outputr  intermediate_outputlayer_outputs
             r%   rZ   TvpEncodeLayer.forward  so     "&/	 "0 "
 2!4(,"//0@A{{#6I/G+r$   r=  r  )
r   r   r   r   r/   r   r   rZ   r#   r\   r]   s   @r%   r6  r6    s+    - ,0
 $D> r$   r6  c            
       |   ^  \ rS rSrU 4S jr     S	S\\R                     S\\   S\\   S\\   4S jjr	Sr
U =r$ )

TvpEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r.   r/   rc   r   
ModuleListrangenum_hidden_layersr6  layergradient_checkpointing)r6   rc   _r7   s      r%   r/   TvpEncoder.__init__  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A&r  r	  output_hidden_statesreturn_dictc                 t   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nSnSn[	        U R
                  5       H|  u  pU(       a  Xq4-   nU R                  (       a8  U R                  (       a'  U R                  U
R                  UUUb  X9   OS U5      nOU
" XX9   U5      nUS   nU(       d  Mt  XS   4-   nM~     U(       a  Xq4-   nU(       d  U4nU(       a  X4-   nU(       a  X4-   nU$ [        UU(       a  UOS U(       a  US9$ S S9$ )Nr   r   r   )last_hidden_stater   r   )rc   rN  r	  rM  	enumeraterI  rJ  training_gradient_checkpointing_func__call__r	   )r6   r   r  r  r	  rM  rN  all_hidden_statesall_attentionsilayer_modulelayer_outputsr  s                r%   rZ   TvpEncoder.forward  s[    &1%<k$++BYBY1B1N-TXT_T_TqTq$8$D $++JjJj 	 (4OA#$58H$H!**t}} $ A A ))!"%.%:Y\%! !-]ILZk l)!,M  !/3C2E!E#  5(   14D D$&G#!$88 !$55N+/C+):~
 	
 AE
 	
r$   )rc   rJ  rI  )NNNNN)r   r   r   r   r/   r   r    r!   r   rZ   r#   r\   r]   s   @r%   rD  rD    sb    , 15,0/3&*4
 E--.	4

 $D>4
 'tn4
 d^4
 4
r$   rD  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	TvpPooleri%  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r"  )r.   r/   r   r   re   r   Tanh
activationr   s     r%   r/   TvpPooler.__init__&  s9    YYv1163E3EF
'')r$   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r_  )r6   r   first_token_tensorpooled_outputs       r%   rZ   TvpPooler.forward+  s6     +1a40

#566r$   )r_  r   r,  r]   s   @r%   r\  r\  %  s(    $
U\\ ell  r$   r\  c                   &    \ rS rSr\rSrSrS rSr	g)TvpPreTrainedModeli4  modelTc                 F   [        U[        R                  [        R                  45      (       a9  UR                  R
                  R                  SU R                  R                  S9  Oh[        U[        R                  5      (       aI  UR                  R
                  R                  5         UR                  R
                  R                  S5        [        U[        R                  5      (       a1  UR                  b$  UR                  R
                  R                  5         [        U[        R                  5      (       ab  [        R                  R                  UR                  SSS9  UR                  b+  [        R                  R!                  UR                  S5        ggg)	zInitialize the weights        )r   stdg      ?Nfan_outr|   )r   nonlinearityr   )r$  r   r   r   weightdatanormal_rc   initializer_ranger   rj   zero_fill_rn   initkaiming_normal_	constant_)r6   modules     r%   _init_weights TvpPreTrainedModel._init_weights:  s   fryy",,788 MM&&CT[[5R5R&S--KK""$MM$$S)fbii((V[[-DKK""$fbii((GG##FMM	PV#W{{&!!&++q1 ' )r$   r   N)
r   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointingrw  r#   r   r$   r%   rf  rf  4  s    L&*#2r$   rf  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )TvpFrameDownPadPrompteriM  z6
Pad frames extracted from videos only at the bottom.
c           	        > UR                   S;  a  [        S5      e[        TU ]  5         UR                  U l        UR
                  U l        UR                  U l        UR                   U l         [        R                  " [        R                  " SUR
                  SUR                  UR                  /5      5      U l        g )NrG   replaceremove9`visual_prompter_apply` must be in (add, replace, remove)r   r   )visual_prompter_applyr4   r.   r/   visual_prompt_size	frame_nummax_img_sizer   	Parameterr    randnpad_downr   s     r%   r/    TvpFrameDownPadPrompter.__init__R  s    ''/KKXYY"(";";))"//%+%A%A"KKF,,a1J1JFL_L_`a
r$   c                    U R                   S:w  ao  [        R                  " U R                  U R                  /UR                  UR
                  S9nSX R                  U R                  -
  U R                  2S S 24'   X-  nU R                   S:w  a  [        R                  " UR                  S   UR                  S   SU R                  U R                  /UR
                  S9nU R                  U R                  -
  nU R                  US S 2S S 2S S 2X@R                  2S S 24'   XR                  UR                  5      -  nU$ )	NrG   r   ri  r  r   r   r   r   )r  r    r   r  r   r   r  r   rx   r  to)r6   r~   visual_prompt_maskpromptstart_points        r%   rZ   TvpFrameDownPadPrompter.forward`  s(   %%.!&""D$5$56l>P>PYeYlYl" fi0043J3JJTM^M^^`aab.L%%1[[##A&(:(:1(=q$BSBSUYUfUfg#**F ++d.E.EEKBF--F1aK*;*;;Q>?IIl&8&899Lr$   )r  r  r  r  r  r   r]   s   @r%   r}  r}  M  s    
 r$   r}  c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\S\R                  4S jr	SS	\
4S
 jjrSrU =r$ )TvpFramePadPrompterir  z7
Pad frames extracted from videos in the surroundings.
c           
        > UR                   S;  a  [        S5      e[        TU ]  5         UR                  U l        UR
                  U l        UR                   U l         UR
                  UR                  S-  -
  U l        [        R                  " [        R                  " SUR                  SUR                  UR
                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR                  UR
                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR
                  UR                  S-  -
  UR                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR
                  UR                  S-  -
  UR                  /5      5      U l        g )Nr  r  rt   r   r   )r  r4   r.   r/   r   r  r  	base_sizer   r  r    r  pad_upr  pad_left	pad_rightr   s     r%   r/   TvpFramePadPrompter.__init__w  s   ''/KKXYY ++"//%+%A%A",,v/H/H1/LLllKKF--q&2K2KVM`M`ab
 KKF--q&2K2KVM`M`ab
 KK%%''&*C*Ca*GG--

 KK%%''&*C*Ca*GG--

r$   r  r   r   r   c                     X R                   -  X0R                   -  pTUR                  u  pgpn
UR                  Xg-  XU
5      n[        R                  R                  UXE4SSS9nUR                  XgXU5      nU$ )z
This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
resolution images (high resolution videos).

r   Fr   )r  rx   r  r   rz   r   )r6   r  r   r   r   r   batchr   channelsprompt_heightprompt_widths              r%   interpolate_pad_encoding,TvpFramePadPrompter.interpolate_pad_encoding  s     +++U5F5F-FBCI<<@8L  2H\Z**	 + 
 8UKr$   r  c           	      ^   U(       a  UR                   S   UR                   S   4OU R                  U R                  4u  p4U R                  S;  a  [        SU R                   35      eU R                  S;   a/  [        R
                  " X4/UR                  UR                  S9nX-  nU R                  S;   a  [        R                  " SU R                  S	U R                  U R                  UR                  S
9n[        R                  " U R                  X`R                  /SS9n[        R                  " U R                  XpR                  /S	S9n[        R                  " UR!                  S5      U/-  5      nU(       a  U R#                  XsU5      nXR%                  UR                  5      -   nU$ )Nr  rb   )rG   r  r  z$Invalid visual_prompter_apply value )r  r  r   )r  rG   r   r   r  rw   r   r   )rx   r  r  r4   r    r   r   r   r   r   r  catr  r  r  r  r   r  r  )r6   r~   r  r   r   r  baser  s           r%   rZ   TvpFramePadPrompter.forward  sl    ( #\%7%7%;<##T%6%67 	
 %%-IICDD^D^C_`aa%%)>>!&VO<CUCU^j^q^q!r.L%%);;;;q$//1dnndnn]i]p]pqDYYt^^D!LFYYV]]CKFYY|003vh>?F'66vuM'))L4F4F*GGLr$   )r  r  r   r  r  r  r  r  r   )r   r   r   r   r   r/   r    r   r   r  r   rZ   r#   r\   r]   s   @r%   r  r  r  sL    $
Lu|| S QT Y^YeYe 0d  r$   r  )framedownpadframepadzw
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    )custom_introc                      ^  \ rS rSrU 4S jrS rS rS r\        SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\   S\	\   S\	\   S\4S jj5       rSrU =r$ )TvpModeli  c                 ,  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l
        [        U5      U l        [        R                  " [        R                   " SSUR"                  /5      5      U l        [        R&                  " UR(                  5      U l        UR,                  [.        ;  a  [1        S5      e[.        UR,                     " U5      U l        U R5                  5         g )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r.   r/   rc   r_   vision_modelr   r   r   visual_embeddingsrD  encoderr\  poolerr   r  r    r  re   text_promptr   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr4   visual_prompter	post_initr   s     r%   r/   TvpModel.__init__  s     *6208!8!@!&)'<<QF<N<N4O(PQzz&"<"<=&&.JJYZZ;F<W<WXY_`r$   c                 .    U R                   R                  $ r"  r   r   )r6   s    r%   get_input_embeddingsTvpModel.get_input_embeddings  s    ...r$   c                 $    XR                   l        g r"  r  )r6   r   s     r%   set_input_embeddingsTvpModel.set_input_embeddings  s    */'r$   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)zPrunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
N)itemsr  rI  r8  r  )r6   heads_to_prunerI  r   s       r%   _prune_headsTvpModel._prune_heads  s<     +002LELLu%//;;EB 3r$   r   r~   r  r  r	  rM  rN  r   c	           	         Ub  UOU R                   R                  nU R                  U R                  X(S95      nU R	                  US9n	U R                  X(S9n
Ub  UR                  U
R                  SS 5      n[        R                  " UR                  S   S5      R                  UR                  UR                  S9n[        R                  " XU/S	S
9nU R                  X1R                  5       5      R                  UR                  5      nU R                   R#                  U	R                  S   S	S	5      n[        R                  " XU
/SS
9nU R%                  UUU R'                  X@R                   R(                  5      UUUS9nU(       a  UR*                  OUS   nU R-                  U5      nU R/                  U5      nU R/                  U5      nU(       d
  UU4USS -   $ [1        UUUR2                  UR4                  S9$ )a  
Examples:
```python
>>> import torch
>>> from transformers import AutoConfig, AutoTokenizer, TvpModel

>>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

>>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

>>> pixel_values = torch.rand(1, 1, 3, 448, 448)
>>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
>>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
```N)r  )r   r   rt   r   r  )r   r   rb   r   r   )r  r  r	  rM  rN  )rP  pooler_outputr   r   )rc   rN  r  r  r   r  new_onesrx   r    r   r  r   r   r  get_extended_attention_maskr   r  r   r  get_head_maskrH  rP  r  r   r
   r   r   )r6   r   r~   r  r  r	  rM  rN  r   text_embedding_outputvisual_embedding_outputvisual_attention_maskpt_maskr  embedding_outputencoder_outputsrP  rc  s                     r%   rZ   TvpModel.forward  s   4 &1%<k$++BYBY((   a
 !%) D"&"8"8 #9 #
 %$2$;$;<S<Y<YZ\[\<]$^!jj!5!5a!8"=@@%,,N4H4H A G #YYAV'W]_`N "==nnnN^_bbclcscstN&&--.C.I.I!.LbRTU 99kJa%bhij,,)((KK4Q4QR/!5# ' 
 BMO==RabcRd$56 LL):;]3%}58KKK)/')77&11	
 	
r$   )	rc   r   r   r  r  r  r  r  r  )NNNNNNNF)r   r   r   r   r/   r  r  r  r   r   r    
LongTensorr!   r   rZ   r#   r\   r]   s   @r%   r  r    s     /0C  15485915,0/3&*).F
E,,-F
 u001F
 !!1!12	F

 E--.F
 $D>F
 'tnF
 d^F
 #'F
 F
r$   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TvpVideoGroundingHeadi?  c                 B  > [         TU ]  5         [        R                  " UR                  UR                  S-  5      U l        [        R                  " UR                  S-  S5      U l        [        R                  " 5       U l        [        R                  " 5       U l
        g )Nrt   )r.   r/   r   r   re   layer_0layer_1ReLUactivation_0Sigmoidactivation_1r   s     r%   r/   TvpVideoGroundingHead.__init__@  sj    yy!3!3V5G5G!5KLyy!3!3a!7;GGIJJLr$   c                     U R                  U R                  U5      5      nU R                  U R                  U5      5      nU$ r"  )r  r  r  r  )r6   r  r   s      r%   rZ   TvpVideoGroundingHead.forwardG  s9    ""4<<#>?""4<<#78r$   )r  r  r  r  r   r]   s   @r%   r  r  ?  s    ) r$   r  zb
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    c                     ^  \ rS rSrU 4S jr\         SS\\R                     S\\R                     S\\R                     S\\
\R                        S\\R                     S\\   S	\\   S
\\   S\4S jj5       rSrU =r$ )TvpForVideoGroundingiM  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r"  )r.   r/   rc   r  rg  r  video_grounding_headr  r   s     r%   r/   TvpForVideoGrounding.__init__S  s8     f%
$9&$A!r$   r   r~   r  rW   r  r	  rM  rN  r   c
                    Ub  UOU R                   R                  nU R                  UUUUUUUU	S9n
U
S   nU R                  U5      nSnUbo  [	        / SQ5      nUR                  U R                  5        U" X5      nUS   U R                   R                  US   -  -   U R                   R                  US   -  -   nU(       d  U4U
SS -   n
Ub  U4U
-   n
U
$ [        UUU
R                  U
R                  S	9$ )
a  
labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
    The labels contains duration, start time, and end time of the video corresponding to the text.

Examples:
```python
>>> import torch
>>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

>>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

>>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

>>> pixel_values = torch.rand(1, 1, 3, 448, 448)
>>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
>>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
```N)r  r	  rM  rN  r   r   r*   r+   r,   r-   rt   )r   r   r   r   )rc   rN  rg  r  r'   r  r   distance_loss_weightduration_loss_weightr   r   r   )r6   r   r~   r  rW   r  r	  rM  rN  r   r  r  r   r   	criterion	loss_dicts                   r%   rZ   TvpForVideoGrounding.forward[  s*   < &1%<k$++BYBY**/!5#%=  	
  
**=9 ?@ILL%!&1I% ++22Yz5JJK++22Yz5JJK 
 i'!"+-G'G+N&!//))	
 	
r$   )rc   rg  r  )	NNNNNNNNF)r   r   r   r   r/   r   r   r    r  r!   r   r   r   rZ   r#   r\   r]   s   @r%   r  r  M  s      1548590415,0/3&*).@
E,,-@
 u001@
 !!1!12	@

 u||,-@
 E--.@
 $D>@
 'tn@
 d^@
 #'@
 @
r$   r  )r  rf  r  )2r   r  dataclassesr   typingr   r   r    torch.utils.checkpointr   activationsr   modeling_outputsr	   r
   r   modeling_utilsr   pytorch_utilsr   utilsr   r   utils.backbone_utilsr   configuration_tvpr   
get_loggerr   loggerr   Moduler'   r_   r   r   r   r   r.  r6  rD  r\  rf  r}  r  r  r  r  r  __all__r   r$   r%   <module>r     s     ! "    ! X X - / , 1 ( 
		H	% ?k ? ?.Mbii M`%RYY %Pnbii nb!RYY !H_299 _Fbii RYY RYY 8;
 ;
~		  2 2 20"bii "JW")) Wv ,#   
e
! e

e
PBII  
J
- J

J
Z Er$   