
"""PyTorch GIT model."""

import math
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    BaseModelOutputWithPooling,
    CausalLMOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from .configuration_git import GitConfig, GitVisionConfig


logger = logging.get_logger(__name__)


@dataclass
class GitVisionModelOutput(ModelOutput):
    r"""
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

Args:
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nimage_embedslast_hidden_state.hidden_states
attentions )__name__
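

# Illustrative sketch (added here; not part of the upstream module): how a populated
# `GitVisionModelOutput` is typically consumed. `vision_outputs` below is a hypothetical,
# already-computed instance; the shapes follow the docstring above.
#
#   image_embeds = vision_outputs.image_embeds        # (batch_size, output_dim), or None
#   last_hidden = vision_outputs.last_hidden_state    # (batch_size, sequence_length, hidden_size)
#   if vision_outputs.hidden_states is not None:
#       # one entry for the embedding output plus one per transformer layer
#       num_entries = len(vision_outputs.hidden_states)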
__module____qualname____firstlineno____doc__r#   r   torchFloatTensor__annotations__r$   r%   r   r&   __static_attributes__r'       \/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/git/modeling_git.pyr!   r!   2   sr    * 15L(5,,-459x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r1   r!   c                      ^  \ rS rSrSrU 4S jr    SS\\R                     S\\R                     S\\R                     S\
S\R                  4
S	 jjrS
rU =r$ )GitEmbeddingsP   z;Construct the embeddings from word and position embeddings.c                 :  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        [        USS5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  g )	N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistent)super__init__r	   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr:   register_bufferr-   arangeexpandselfconfig	__class__s     r2   rB   GitEmbeddings.__init__S   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c  f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
r1   	input_idsr<   inputs_embedspast_key_values_lengthreturnc                 N   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XFU-   24   nUc  U R                  U5      nOUnU R                  S:X  a  U R	                  U5      nXx-  nU R                  U5      nU R                  U5      nU$ )Nr>   r   r;   )sizer<   rG   r:   rI   rJ   rN   )	rT   rX   r<   rY   rZ   input_shape
seq_length
embeddingsrI   s	            r2   forwardGitEmbeddings.forwardb   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL --i8J&J'':5"&":":<"H-J^^J/
\\*-
r1   )rJ   rN   r:   rI   rG   )NNNr   )r(   r)   r*   r+   r,   rB   r   r-   
LongTensorr.   intTensorra   r0   __classcell__rV   s   @r2   r4   r4   P   sx    E
" 153759&'E,,- u//0   1 12	
 !$ 
 r1   r4   c                     ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr     SS\R                  S\\R                     S\\R                     S	\\
   S
\\   S\\   S\\R                     4S jjrSrU =r$ )GitSelfAttention   c                 ,  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eX0l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        UR                  R                  UR                  R                   -  S-  S	-   5      U l        UR$                  b  U =R"                  UR$                  -  sl        [&        R(                  " UR                  U R                  5      U l        [&        R(                  " UR                  U R                  5      U l        [&        R(                  " UR                  U R                  5      U l        [&        R0                  " UR2                  5      U l        U=(       d    [7        US
S5      U l        U R8                  S:X  d  U R8                  S:X  aH  UR:                  U l        [&        R<                  " SUR:                  -  S	-
  U R                  5      U l        g g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.   r   r:   r;   relative_keyrelative_key_query) rA   rB   rE   num_attention_headshasattr
ValueError	layer_idxloggerwarning_oncerV   r(   rd   attention_head_sizeall_head_sizevision_config
image_size
patch_sizeimage_patch_tokensnum_image_with_embeddingr	   LinearquerykeyvaluerL   attention_probs_dropout_probrN   rO   r:   rH   rC   distance_embeddingrT   rU   r:   rt   rV   s       r2   rB   GitSelfAttention.__init__   s)    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  # !8!8 9 :, , $*#=#= #&v'9'9F<V<V'V#W !558P8PP"%v';';'F'FI]I]IhIh'hmn&nqr&r"s**6##v'F'FF#YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# >rr1   xr[   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr>   r   rn   r   r
   )r]   rq   rw   viewpermute)rT   r   new_x_shapes      r2   transpose_for_scores%GitSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r1   r%   attention_mask	head_maskpast_key_valueoutput_attentionspixel_values_presentc           	         U R                  U5      nU(       a  U R                  OSnU R                  U R                  U5      5      n	U R                  U R	                  U5      5      n
Ub  UR                  U	S S 2S S 2US 2S S 24   U
S S 2S S 2US 2S S 24   U R                  5      u  p[        R                  " U	S S 2S S 2S U2S S 24   U/SS9n	[        R                  " U
S S 2S S 2S U2S S 24   U/SS9n
U R                  U5      n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  UR                  S   U	R                  S   nnUbB  [        R                  " US-
  [        R                  UR                  S	9R!                  SS5      nO>[        R"                  " U[        R                  UR                  S	9R!                  SS5      n[        R"                  " U[        R                  UR                  S	9R!                  SS5      nUU-
  nU R%                  UU R&                  -   S-
  5      nUR)                  UR*                  S
9nU R                  S:X  a  [        R,                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R,                  " SUU5      n[        R,                  " SU	U5      nUU-   U-   nU[.        R0                  " U R2                  5      -  nUb  X-   n[4        R6                  R9                  USS9nU R;                  U5      nUb  UU-  n[        R                  " UU
5      nUR=                  SSSS5      R?                  5       nURA                  5       S S U RB                  4-   nUR!                  U5      nU(       a  UU4OU4nUU4-   nU$ )Nr   rn   dimr>   ro   rp   r   dtypedevicer   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   )"r   r|   r   r   r   updatert   r-   catmatmul	transposer:   shapetensorlongr   r   rQ   r   rH   tor   einsummathsqrtrw   r	   
functionalsoftmaxrN   r   
contiguousr]   rx   )rT   r%   r   r   r   r   r   mixed_query_layercutoff	key_layervalue_layerkey_layer_pastvalue_layer_pastquery_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r2   ra   GitSelfAttention.forward   s    !JJ}5,@((a--dhh}.EF	//

=0IJ%/=/D/D!Q*+[Avw9I-JDNN0,N 		9Q7F7A-=#>"OUVWI))[Aww1A%BDT$U[\]K//0AB !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L)!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]^--r1   )rx   rw   r   rN   r|   r   rt   rH   rq   r:   r   r   NNNNNFF)r(   r)   r*   r+   rB   r-   re   r   r   r.   r   boolr   ra   r0   rf   rg   s   @r2   ri   ri      s     uD%ell %u|| % 7;15*.,1/4J||J !!2!23J E--.	J
 !J $D>J 'tnJ 
u||	J Jr1   ri   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )GitSelfOutput   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr8   )rA   rB   r	   r~   rE   denserJ   rK   rL   rM   rN   rS   s     r2   rB   GitSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r1   r%   input_tensorr[   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ Nr   rN   rJ   rT   r%   r   s      r2   ra   GitSelfOutput.forward   5    

=1]3}'CDr1   rJ   r   rN   
r(   r)   r*   r+   rB   r-   re   ra   r0   rf   rg   s   @r2   r   r      6    >U\\  RWR^R^  r1   r   eagerc                      ^  \ rS rSrSU 4S jjrS r     SS\R                  S\\R                     S\\R                     S\\
   S\\   S	\\   S
\\R                     4S jjrSrU =r$ )GitAttentioni	  c                    > [         TU ]  5         [        UR                     " XUS9U l        [        U5      U l        [        5       U l        g )N)r:   rt   )	rA   rB   GIT_SELF_ATTENTION_CLASSES_attn_implementationrT   r   outputsetpruned_headsr   s       r2   rB   GitAttention.__init__
  sB    .v/J/JKy
	 $F+Er1   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   rT   rq   rw   r   r   r   r   r   r   r   rx   union)rT   headsindexs      r2   prune_headsGitAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r1   r%   r   r   r   r   r   r[   c                 n    U R                  UUUUUU5      nU R                  US   U5      nU4USS  -   n	U	$ )Nr   r   )rT   r   )
rT   r%   r   r   r   r   r   self_outputsattention_outputr   s
             r2   ra   GitAttention.forward%  sT     yy 
  ;;|AF#%QR(88r1   )r   r   rT   r   r   )r(   r)   r*   r+   rB   r   r-   re   r   r.   r   r   r   ra   r0   rf   rg   s   @r2   r   r   	  s    ";* 7;15*.,1/4|| !!2!23 E--.	
 ! $D> 'tn 
u||	 r1   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )GitIntermediatei<  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rA   rB   r	   r~   rE   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrS   s     r2   rB   GitIntermediate.__init__=  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r1   r%   r[   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rT   r%   s     r2   ra   GitIntermediate.forwardE  s&    

=100?r1   r   r   rg   s   @r2   r   r   <  s(    9U\\ ell  r1   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )	GitOutputiL  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rA   rB   r	   r~   r   rE   r   rJ   rK   rL   rM   rN   rS   s     r2   rB   GitOutput.__init__M  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r1   r%   r   r[   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r2   ra   GitOutput.forwardS  r   r1   r   r   rg   s   @r2   r   r   L  r   r1   r   c                      ^  \ rS rSrSU 4S jjr     SS\R                  S\\R                     S\\R                     S\\	   S\\
   S\\
   S	\\R                     4S
 jjrS rSrU =r$ )GitLayeriZ  c                    > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        [        U5      U l        [        U5      U l	        g )Nr   )rt   )
rA   rB   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   )rT   rU   rt   rV   s      r2   rB   GitLayer.__init__[  sI    '-'E'E$%fB+F3'r1   r%   r   r   r   r   r   r[   c           	          U R                  UUUUUUS9nUS   nUSS n	US   n
[        U R                  U R                  U R                  U5      nU4U	-   n	X4-   n	U	$ )N)r   r   r   r   r   r>   )r   r   feed_forward_chunkr   r   )rT   r%   r   r   r   r   r   self_attention_outputsr   r   present_key_valuelayer_outputs               r2   ra   GitLayer.forwardc  s     "&/)!5 "0 "
 2!4 )2.2260##T%A%A4CSCSUe
  /G+ 00r1   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   )rT   r   intermediate_outputr  s       r2   r  GitLayer.feed_forward_chunk  s)    "//0@A{{#6Ir1   )r   r   r   r   r   r   r   )r(   r)   r*   r+   rB   r-   re   r   r.   r   r   r   ra   r  r0   rf   rg   s   @r2   r   r   Z  s    ( 7;15*.,1/4 ||  !!2!23  E--.	 
 !  $D>  'tn  
u||	 D r1   r   c                   (  ^  \ rS rSrU 4S jr        SS\R                  S\\R                     S\\R                     S\\	\
\\\R                        4      S\\   S\\   S	\\   S
\\   S\\   S\	\\R                     \4   4S jjrSrU =r$ )
GitEncoderi  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf NF)
rA   rB   rU   r	   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rT   rU   irV   s      r2   rB   GitEncoder.__init__  sR    ]]vG_G_A`#aA`AHV$7A`#ab
&+# $b   A&r%   r   r   past_key_values	use_cacher   output_hidden_statesr   return_dictr[   c
           	      ^   U R                   (       a/  U R                  (       a  U(       a  [        R                  S5        SnSn
U(       aP  [	        U[
        5      (       d;  Sn
Uc  [        5       nO+[        R                  " U5      n[        R                  S5        U(       a  SOS nU(       a  SOS nS n[        U R                  5       H  u  pU(       a  X4-   nUb  X>   OS nU R                   (       a2  U R                  (       a!  U R                  UR                  UUUUU5      nOU" UUUUUU5      nUS   nU(       a  US   nU(       d  M  UUS   4-   nM     U(       a  X4-   nU(       a  UOS nU
(       a  UR                  5       nU	(       d  [        S	 UUUU4 5       5      $ [        UUUUS
9$ )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r'   r   r>   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r'   .0vs     r2   	<genexpr>%GitEncoder.forward.<locals>.<genexpr>  s"      	A  s   	r$   r  r%   r&   )r  trainingru   rv   r   r   r   from_legacy_cache	enumerater  _gradient_checkpointing_func__call__to_legacy_cachetupler   )rT   r%   r   r   r  r  r   r  r   r  return_legacy_cacheall_hidden_statesall_self_attentionsnext_decoder_cacher  layer_modulelayer_head_masklayer_outputs
next_caches                      r2   ra   GitEncoder.forward  s    &&4==##p "	 $Z??"&&".."."@"@"Q##^ #7BD$5b4!(4OA#$58H$H!.7.CilO**t}} $ A A ))!"##%! !-!"##%(! *!,M%22%6"  &9]1=M<O&O#;  5>   14D D+4'$
#335J 	 "%'		 	 	 '+&+*	
 	
r1   )rU   r  r  )NNNNFFFT)r(   r)   r*   r+   rB   r-   re   r   r.   r   r   r   r   r   ra   r0   rf   rg   s   @r2   r  r    s    , 7;15SW$(,1/4/4&*Z
||Z
 !!2!23Z
 E--.	Z

 "%uU5;L;L5M/N(N"OPZ
 D>Z
 $D>Z
 'tnZ
 'tnZ
 d^Z
 
uU\\"$;;	<Z
 Z
r1   r  c                   .    \ rS rSr\rSrSrSrSr	S r
Srg)GitPreTrainedModeli  gitTc                    [        U[        5      (       a  [        R                  R	                  UR
                  SU R                  R                  S9  [        R                  R	                  UR                  R                  U R                  R                  S9  [        R                  R	                  UR                  R                  U R                  R                  S9  [        U[        R                  5      (       ak  UR                  R                  R	                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R	                  SU R                  R                  S9  UR                   b2  UR                  R                  UR                      R                  5         gg[        U[        R"                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R%                  S5        gg)zInitialize the weights        )meanstd)r9  Ng      ?)r   GitVisionEmbeddingsr	   initnormal_class_embeddingrU   initializer_rangepatch_embeddingweightposition_embeddingr~   databiaszero_rC   r7   rJ   fill_)rT   modules     r2   _init_weights GitPreTrainedModel._init_weights  s   f122GGOOF22$++B_B_O`GGOOF2299t{{?\?\O]GGOOF55<<$++B_B_O`fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r1   r'   N)r(   r)   r*   r+   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_cache_class_supports_quantized_cacherG  r0   r'   r1   r2   r4  r4    s$    L&*#  $*r1   r4  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )r:  i  rU   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestriderC  rn   r   r<   r=   r?   )rA   rB   rU   rE   	embed_dimrz   r{   r	   	Parameterr-   randnr=  Conv2dnum_channelsr?  num_patchesnum_positionsrC   rA  rP   rQ   rR   rS   s     r2   rB   GitVisionEmbeddings.__init__  s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr1   r`   heightwidthr[   c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
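
Example (an illustrative sketch added here, not part of the original docstring; it assumes the
`interpolate_pos_encoding` forward argument of [`GitVisionModel`] and a 448x448 input instead of the 224x224
resolution used during pre-training):

```python
>>> import torch
>>> from transformers import GitVisionModel

>>> model = GitVisionModel.from_pretrained("microsoft/git-base")
>>> pixel_values = torch.randn(1, 3, 448, 448)
>>> outputs = model(pixel_values=pixel_values, interpolate_pos_encoding=True)
>>> last_hidden_state = outputs.last_hidden_state  # shape (1, 1 + 28 * 28, hidden_size)
```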
r   r   Nr>   g      ?r
   rn   bicubicF)r]   modealign_cornersr   )r   rA  r@  	unsqueezer-   jit
is_tracingr<   r{   r   reshaper   r	   r   interpolater   r   )rT   r`   r\  r]  rY  rA  rZ  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r2   interpolate_pos_encoding,GitVisionEmbeddings.interpolate_pos_encoding$  si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr1   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model ().r   rn   r   r>   r   )r   rz   rs   r?  r@  r   r   flattenr   r=  rR   r-   r   rl  rA  r<   )rT   rn  rl  
batch_size_r\  r]  target_dtypepatch_embedsclass_embedsr`   s              r2   ra   GitVisionEmbeddings.forwardM  s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr1   )	r=  rU   rT  rz   rY  rZ  r?  r{   rA  F)r(   r)   r*   r+   r   rB   r-   re   rd   rl  r.   ra   r0   rf   rg   s   @r2   r:  r:    si    q q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r1   r:  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )GitVisionMLPi`  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )rA   rB   rU   r   r   activation_fnr	   r~   rE   r   fc1fc2rS   s     r2   rB   GitVisionMLP.__init__a  sb    #F$5$5699V//1I1IJ99V55v7I7IJr1   r%   r[   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r~  r}  r  r   s     r2   ra   GitVisionMLP.forwardh  s4    /**=9/r1   )r}  rU   r~  r  r   rg   s   @r2   r{  r{  `  s)    KU\\ ell  r1   r{  rF  r   r   r   r   scalingrN   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr>   r   )r   r   )pr#  r   rn   )r-   r   r   r	   r   r   float32r   r   rN   r#  r   )
rF  r   r   r   r   r  rN   kwargsattn_weightsattn_outputs
             r2   eager_attention_forwardr  p  s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r1   c                      ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                  \\R                     4   4
S	 jjrS
rU =r$ )GitVisionAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: rq  g      F)rA   rB   rU   rE   rT  rq   	num_headshead_dimrs   scaleattention_dropoutrN   	is_causalr	   r~   k_projv_projq_projout_projrS   s     r2   rB   GitVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar1   r%   r   causal_attention_maskr   r[   c                    UR                   u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  XVU R
                  U R                  5      R                  SS5      nU	R	                  XVU R
                  U R                  5      R                  SS5      n	U
R	                  XVU R
                  U R                  5      R                  SS5      n
U R                  R                  S:w  a  Ub  Ub  X#-   nOUb  UnO	USLU l
        [        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU" U UU	U
UU R                  U R                  U R                   (       d  SOU R"                  S	9u  pUR%                  XVU5      R'                  5       nU R)                  U5      nU(       d  SnX4$ )
z#Input shape: Batch x Time x Channelr   rn   flash_attention_2Nr   sdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r7  )r  r  rN   )r   r  r  r  r   r  r  r   rU   r   r  r  ru   rv   r   r  r#  rN   re  r   r  )rT   r%   r   r  r   rs  r_   rT  querieskeysvaluesattention_interfacer  r  s                 r2   ra   GitVisionAttention.forward  s    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/!G&2!62$>DN(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0 L((r1   )rU   rN   rT  r  r  r  r  r  r  r  r  )NNF)r(   r)   r*   r+   r,   rB   r-   re   r   r   r   ra   r0   rf   rg   s   @r2   r  r    s    GB. 268<,15)||5) !.5)  (5	5)
 $D>5) 
u||Xell33	45) 5)r1   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\\	   S\
\R                     4
S	 jjrS
rU =r$ )GitVisionEncoderLayeri  rU   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g r   )rA   rB   rE   rT  r  	self_attnr	   rJ   rK   layer_norm1r{  mlplayer_norm2rS   s     r2   rB   GitVisionEncoderLayer.__init__  sm    +++F3<<F<Q<QR'<<F<Q<QRr1   r%   r   r  r   r[   c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    causal_attention_mask (`torch.FloatTensor`): causal attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r%   r   r  r   )r  r  r  r  )rT   r%   r   r  r   residualr  r   s           r2   ra   GitVisionEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr1   )rT  r  r  r  r  ry  )r(   r)   r*   r+   r   rB   r-   re   r   r   r   r.   ra   r0   rf   rg   s   @r2   r  r    sk    S S -2&||& &  %||	&
 $D>& 
u  	!& &r1   r  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\\R                     S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jjrSrU =r$ )GitVisionEncoderi	  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`GitVisionEncoderLayer`].

Args:
    config: GitVisionConfig
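
Example (an illustrative sketch added here; the encoder is an internal building block, so it is imported from
the module path directly):

```python
>>> from transformers import GitVisionConfig
>>> from transformers.models.git.modeling_git import GitVisionEncoder

>>> config = GitVisionConfig()
>>> encoder = GitVisionEncoder(config)
>>> len(encoder.layers) == config.num_hidden_layers
True
```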
rU   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r  )
rA   rB   rU   r	   r  r  r  r  layersr  )rT   rU   rt  rV   s      r2   rB   GitVisionEncoder.__init__  sT    mmERXRjRjLk$lLkq%:6%BLk$lm&+# %mr  r   r  r   r  r  r[   c                 L   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       Hr  u  pU(       a  Xy4-   nU R                  (       a1  U R                  (       a   U R                  UR                  U	UUU5      nO	U" U	UUUS9nUS   n	U(       d  Mj  XS   4-   nMt     U(       a  Xy4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr'   )r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r'   r  s     r2   r   +GitVisionEncoder.forward.<locals>.<genexpr>d  s     e$Sq$Ss   	r$   r%   r&   )rU   r   r  use_return_dictr%  r  r  r#  r&  r'  r)  r   )rT   rY   r   r  r   r  r  encoder_statesall_attentionsr%   idxencoder_layerr0  s                r2   ra   GitVisionEncoder.forward  s8   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M  !/3C2E!E- #90  +.>>Ne]N$Seee+Vd
 	
r1   )rU   r  r  )NNNNN)r(   r)   r*   r+   r,   r   rB   r   r-   re   r   r   r   r   ra   r0   rf   rg   s   @r2   r  r  	  s    , , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
 O
r1   r  c                      ^  \ rS rSrS\4U 4S jjr\     SS\\R                     S\\
   S\\
   S\\
   S\\
   S	\\\4   4S
 jj5       rSrU =r$ )GitVisionTransformerij  rU   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r   )rA   rB   rU   rE   r:  r`   r	   rJ   rK   pre_layrnormr  encoderpost_layernorm)rT   rU   rT  rV   s      r2   rB   GitVisionTransformer.__init__l  sd    &&	-f5LL8M8MN'/ ll9:O:OPr1   rn  r   r  rl  r  r[   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nU(       d	  U4USS  -   $ [        UUR                  UR                  S9$ )Nz You have to specify pixel_valuesrl  )rY   r   r  r  r   r   r  )rU   r   r  r  rs   r`   r  r  r  r   r%   r&   )	rT   rn  r   r  rl  r  r%   encoder_outputsr$   s	            r2   ra   GitVisionTransformer.forwardv  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h))-8,,'/!5#	 ' 
 ,A. //0AB%'/!"*===/)77&11
 	
r1   )rU   r`   r  r  r  NNNFN)r(   r)   r*   r+   r   rB   r   r   r-   r.   r   r   r   r   ra   r0   rf   rg   s   @r2   r  r  j  s    Q Q  59,0/338&*&
u001&
 $D>&
 'tn	&

 #+4.&
 d^&
 
uo%	&&
 &
r1   r  zY
    The vision model from CLIP, used in GIT, without any head or projection on top.
    )custom_introc                      ^  \ rS rSr\rSrS\4U 4S jjrS\R                  4S jr
\     SS\\R                     S\\   S\\   S	\S
\\   S\\\4   4S jj5       rSrU =r$ )GitVisionModeli  rn  rU   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rA   rB   r  vision_model	post_initrS   s     r2   rB   GitVisionModel.__init__  s'     08r1   r[   c                 B    U R                   R                  R                  $ r   )r  r`   r?  rT   s    r2   get_input_embeddings#GitVisionModel.get_input_embeddings  s      ++;;;r1   r   r  rl  r  c                 ^    Ub  UOU R                   R                  nU R                  UUUUUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GitVisionModel

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
>>> model = GitVisionModel.from_pretrained("microsoft/git-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```)rn  r   r  rl  r  )rU   r  r  )rT   rn  r   r  rl  r  s         r2   ra   GitVisionModel.forward  sA    8 &1%<k$++B]B]  %/!5%=# ! 
 	
r1   )r  r  )r(   r)   r*   r+   r   rI  main_input_namerB   r	   Moduler  r   r   r-   r.   r   r   r   r   ra   r0   rf   rg   s   @r2   r  r    s     #L$O <bii <  59,0/3).&*#
u001#
 $D>#
 'tn	#

 #'#
 d^#
 
uo%	&#
 #
r1   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )GitProjectioni  rU   c                 .  > [         TU ]  5         Xl        [        R                  " [        R
                  " UR                  R                  UR                  5      [        R                  " UR                  UR                  R                  S95      U l
        g r   )rA   rB   rU   r	   
Sequentialr~   ry   rE   rJ   rK   visual_projectionrS   s     r2   rB   GitProjection.__init__  sd    !#IIf**668J8JKLL++1E1E1T1TU"
r1   r`   r[   c                 $    U R                  U5      $ r   )r  )rT   r`   s     r2   ra   GitProjection.forward  s    %%j11r1   )rU   r  )r(   r)   r*   r+   r   rB   r-   re   ra   r0   rf   rg   s   @r2   r  r    s/    
y 
2%,, 25<< 2 2r1   r  zy
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    c                      ^  \ rS rSrU 4S jrS rS rS rS\S\	R                  S\	R                  S	\	R                  4S
 jrSS jr\            SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\\\\	R(                     4      S\\   S\\   S\\   S\S\\   S	\\\	R                     \4   4S jj5       rSrU =r$ )GitModeli  c                 r  >^ [         TU ]  T5        TU l        [        T5      U l        [        TR                  5      U l        [        T5      U l	        [        T5      U l        TR                  b8  [        R                  " U4S j[        TR                  5       5       5      U l        U R#                  5         g )Nc              3      >#    U  HE  n[         R                  " [        R                  " S S TR                  R
                  5      5      v   MG     g7f)r   N)r	   rU  r-   zerosry   rE   )r  rt  rU   s     r2   r   $GitModel.__init__.<locals>.<genexpr>  s=      ;?A U[[Av/C/C/O/OPQQ?s   AA)rA   rB   rU   r4   r`   r  ry   image_encoderr  r  r  r  r}   r	   ParameterListr  img_temperal_embeddingr  rS   s    `r2   rB   GitModel.__init__  s     '/+F,@,@A!&)!.v!6**6*,*:*: ;v>>?; +D' 	r1   c                 .    U R                   R                  $ r   r`   rG   r  s    r2   r  GitModel.get_input_embeddings   s    ...r1   c                 $    XR                   l        g r   r  )rT   r   s     r2   set_input_embeddingsGitModel.set_input_embeddings  s    */'r1   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r  r   r   )rT   heads_to_pruner  r   s       r2   _prune_headsGitModel._prune_heads  s<    
 +002LELLu%//;;EB 3r1   r]   r   r   r[   c           	          [         R                  " [         R                  " XX2S9SS9nUR                  US:H  [	        S5      5      nU$ )Nr   r   r   )diagonal-inf)r-   triuonesmasked_fillfloat)rT   r]   r   r   masks        r2   _generate_future_maskGitModel._generate_future_mask  s=    zz%**TLWXY	5=9r1   c                 n   UR                   S   nUR                   S   nUR                  nUR                  n	[        R                  " Xw4XS9n
[        R
                  " XvU-   4[        S5      UR                  U	S9n[        R                  " Xg4U	UR                  S9nUS:  a?  [        R                  " UR                   S   UR                   S   U-   4U	UR                  S9n[        R                  " X4SS9n[        R                  " XR                  U	5      4SS9n[        R                  " X4SS9S S S 24   nUc2  [        R
                  " UR                   S   UR                   S   4SUS9nUR                  [        R                  :w  a  [        S	5      e[        R                  " XQR                  S
9n[        S5      UU'   UR                  UR                   S   Xv-   Xt-   U-   45      nUR                  5       nUS S 2S S 2S U24   nUS S 2S S S 24   nUU-   US S 2S S 2S U24'   US S 2S S S 2S S 24   nU$ )Nr   r  r  r   r   r   F)
fill_valuer   z1Memory key padding mask must be a boolean tensor.r   )r   r   r   r-   r  fullr  r   r   r   rs   
zeros_likerR   clone)rT   tgtmemorytgt_maskrZ   memory_key_padding_masknum_tgt
num_memoryr   r   top_left	top_rightbottom_leftleftrightfull_attention_maskzero_negative_infinityorigin_leftr   s                      r2   create_attention_maskGitModel.create_attention_mask  s0   ))A,\\!_
		;;
7TJJ#99:&M::	
	 kk!??
 "A%{{"HNN1$58N$NOH yy(0a8		9kk%&89qA#ii1=dAgF"*&+jj&,,q/6<<PQ?1S`ent&u#"((EJJ6PQQ!&!1!12IQZQZ![:?-67188$**1-z/CZEhkrErs
 2779)!Q*;<'4
31<v1EAq+:+-. 2!T1a-@""r1   rX   r   r<   rn  r   rY   r  r  r   r  rl  r  c                 `   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[        S5      eUS   nSnUb:  [        U[        5      (       d  US   S   R                  S   OUR                  5       nU R                  XPR                   R                  5      nSnUb  UR                  S:X  a  U R                  XKS	9R                   nOUR                  S
:X  a  / n[#        UR                  S   5       HM  nU R                  USS2USS2SS24   US	9R                   nUU R$                  U   -  nUR'                  U5        MO     [(        R*                  " USS9nO[        S5      eU R-                  U5      nU R/                  UUUUS9nUcG  [(        R0                  " UR                  S   SUR                  S   4UR2                  UR4                  S9nUR7                  UR                  S5      UR                  S5      -  SS5      n[(        R*                  " UU4SS9nU R9                  UUR2                  UR4                  5      nU R;                  UUUUS9nUbk  [=        UUR2                  US   S9R?                  UR4                  5      nUS:  a  USS2SS2U* S2SS24   nO!USS2SS2US   * S2US   * S24==   U-  ss'   U RA                  UUUUUU	U
UUSLS9	nUS   nU(       d	  U4USS -   $ [C        UURD                  URF                  URH                  S9$ )a?  
Examples:

```python
>>> from transformers import AutoProcessor, AutoModel
>>> import requests
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
>>> model = AutoModel.from_pretrained("microsoft/git-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> text = "this is an image of two cats"

>>> inputs = processor(images=image, text=text, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer>   z5You have to specify either input_ids or inputs_embedsr   r   rn      r     r   z#pixel_values must be of rank 4 or 5)rX   r<   rY   rZ   r   )r  r  r  rZ   )tgt_len)r   r   r  r  r   r  r  r   r"  )%rU   r   r  r  r  rs   %warn_if_padding_and_no_attention_maskr]   r   r   r   get_seq_lengthget_head_maskr  ndimr  r$   r  r  appendr-   r   r  r`   r  r   r   repeatr  r  r   r   r  r   r  r%   r&   )rT   rX   r   r<   rn  r   rY   r  r  r   r  rl  r  r^   r_   rZ   projected_visual_featuresvisual_features	frame_idxvisual_features_frameembedding_outputr%   r  combined_attention_maskexpanded_attn_maskr  sequence_outputs                              r2   ra   GitModel.forwardF  s8   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU ^
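
A follow-up sketch (added here, not in the original example): the sequence axis of `last_hidden_state` holds the
projected image patch tokens first, followed by the text tokens, so the text positions can be recovered by
offsetting with the patch count:

```python
>>> vision_config = model.config.vision_config
>>> num_image_tokens = (vision_config.image_size // vision_config.patch_size) ** 2 + 1  # 14 * 14 patches + CLS
>>> text_hidden_states = last_hidden_state[:, num_image_tokens:, :]
```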
 "#& "/599  "1%++A.$335 # &&y++2O2OP	$(!#  A%"&"4"4  #5 ###   ""a'"$!&|'9'9!'<!=I,0,>,>$Q	1a%78Sk -? -'' * *T-H-H-SS)#**+@A "> #())O"C !!FGG(,(>(>(O%??%'#9	 + 
 %,(-!''*A/?/E/Ea/HI&,,'..)% %>$D$D!!!$(A(F(Fq(II1a%
!
 		#<>N"OUVW --j:J:P:PRbRiRij #'"<"< ,#9	 #= #
 % "< 0 6 6B"b!(()  &)%71?U>U>VXY8Y%Z"'1{1~o.?+a.AR(RSWiiS,,2+/!5#!-T!9 ' 

 *!,#%(;;;&-+;;)77&11	
 	
r1   )rU   r`   r  r  r  r  r   )NNNNNNNNNNFN)r(   r)   r*   r+   rB   r  r  r  rd   r-   r   r   re   r  r  r   r   r   r   r   r.   r   r   r   ra   r0   rf   rg   s   @r2   r  r    s   &/0C# ekk 5<< \a\h\h 0#d  -115/3/3,004KO$(,0/3).&*c
ELL)c
 !.c
 u||,	c

 u||,c
 ELL)c
  -c
 "%tE4E4E/F(F"GHc
 D>c
 $D>c
 'tnc
 #'c
 d^c
 
uU\\"$>>	?c
 c
r1   r  z`
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    """
)
class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.git = GitModel(config)
        self.output = nn.Linear(config.hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output

    def set_output_embeddings(self, new_embeddings):
        self.output = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
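
    Internally the image patch positions are dropped and the logits are shifted by one before the loss, so position
    *t* predicts token *t + 1*; a rough, self-contained sketch of that alignment (the sizes below are made up purely
    for illustration):

    ```python
    >>> import torch

    >>> logits = torch.randn(1, 3 + 5, 30522)  # 3 hypothetical image patch positions followed by 5 text positions
    >>> labels = torch.randint(0, 30522, (1, 5))
    >>> shifted_logits = logits[:, 3:-1, :]  # drop the image positions and the final step
    >>> shifted_labels = labels[:, 1:]  # each remaining position is supervised by the next token
    >>> shifted_logits.shape[1] == shifted_labels.shape[1]
    True
    ```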

Examples:

Image captioning example:

```python
>>> from transformers import AutoProcessor, AutoModelForCausalLM
>>> import requests
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values
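>>> # pixel_values is a 4D tensor of shape (batch_size, num_channels, height, width)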

>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
>>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_caption)
two cats sleeping on a pink blanket next to remotes.
```
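
`generate` accepts the usual generation keyword arguments; a variant of the call above using beam search (the settings here are illustrative, not tuned):

```python
>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50, num_beams=4)
>>> print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])  # doctest: +SKIP
```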

Visual question answering (VQA) example:

```python
>>> from transformers import AutoProcessor, AutoModelForCausalLM
>>> from huggingface_hub import hf_hub_download
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

>>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
>>> image = Image.open(file_path).convert("RGB")

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

>>> question = "what does the front of the bus say at the top?"

>>> input_ids = processor(text=question, add_special_tokens=False).input_ids
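>>> # add_special_tokens=False skips the [CLS] token, which the model expects at the start of the prompt,
>>> # so it is prepended manually in the next line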
>>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
>>> input_ids = torch.tensor(input_ids).unsqueeze(0)

>>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
>>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
['what does the front of the bus say at the top? special']
```
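
Since `generate` returns the prompt tokens followed by the continuation, the answer alone can be recovered by slicing off the question (a small illustrative addition to the example above):

```python
>>> answer_ids = generated_ids[:, input_ids.shape[1] :]
>>> print(processor.batch_decode(answer_ids, skip_special_tokens=True))  # doctest: +SKIP
```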

Video captioning example:

```python
>>> import av
>>> import numpy as np
>>> from PIL import Image
>>> from huggingface_hub import hf_hub_download
>>> from transformers import AutoProcessor, AutoModelForCausalLM

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

>>> # set seed for reproducibility
>>> np.random.seed(45)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`List[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`List[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices
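>>> # quick sanity check of the sampler defined above (shape only, so it does not depend on the random seed)
>>> sample_frame_indices(clip_len=6, frame_sample_rate=4, seg_len=100).shape  # doctest: +SKIP
(6,)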


>>> # load video
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample frames
>>> num_frames = model.config.num_image_with_embedding
>>> indices = sample_frame_indices(
...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
... )
>>> frames = read_video_pyav(container, indices)

>>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values
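>>> # note: the number of sampled frames should match model.config.num_image_with_embedding,
>>> # since each frame is paired with its own learned temporal embedding inside the model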

>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

>>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.git(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            pixel_values=pixel_values,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        logits = self.output(sequence_output)

        loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            num_image_tokens = self.git.encoder.layer[0].attention.self.image_patch_tokens
            shifted_logits = logits[:, num_image_tokens:-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss = self.loss_function(
                shifted_logits.view(-1, self.config.vocab_size),
                labels.view(-1),
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
    ):
        # cut input_ids if past_key_values is used
        if past_key_values is not None:
            past_length = past_key_values.get_seq_length()

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        # if the attention mask is not provided, create it on the fly
        input_shape = input_ids.shape
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": kwargs.get("pixel_values", None),
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }

    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past

__all__ = ["GitForCausalLM", "GitModel", "GitPreTrainedModel", "GitVisionModel"]