
    fTh{*                    v   S r SSKrSSKJrJrJrJrJr  SSKrSSK	rSSKJ
r
  SSKJr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJrJrJrJr  SSKJr  SSKJr  SSKJ r J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(J)r)J*r*  \#" 5       (       a  SSK+J,r,  SSK-J.r.  \&R^                  " \05      r1 " S S\
Rd                  5      r3 SSK4J5r5  \5r3\1Rm                  S5        \Rt                  " \35         " S S\
Rd                  5      r; " S S\
Rd                  5      r< " S S\
Rd                  5      r= " S S\
Rd                  5      r> " S S \
Rd                  5      r?\" " S! S"\5      5       r@\" " S# S$\@5      5       rA " S% S&\
Rd                  5      rB " S' S(\
Rd                  5      rC " S) S*\
Rd                  5      rD " S+ S,\
Rd                  5      rE " S- S.\
Rd                  5      rF " S/ S0\
Rd                  5      rG\"" S1S29 " S3 S4\@5      5       rH\"" S5S29 " S6 S7\@\5      5       rI/ S8QrJg! \7 a     GNP\8 a    \1Rs                  S5         GNif = f)9zPix2Struct modeling file    N)DictListOptionalTupleUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)ALL_LAYERNORM_LAYERS)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfig)	BlockMask)make_flex_block_causal_maskc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Pix2StructLayerNorm=   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zS
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      j/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr)   Pix2StructLayerNorm.__init__>   s/     	ll5::k#:; #    c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   T)keepdim)tor+   float32powmeanrsqrtr.   r-   dtypefloat16bfloat16)r/   hidden_statesvariances      r3   forwardPix2StructLayerNorm.forwardF   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r5   )r.   r-   )gư>__name__
__module____qualname____firstlineno__r)   rD   __static_attributes____classcell__r2   s   @r3   r%   r%   =   s    $+ +r5   r%   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
Pix2StructVisionEmbeddingsf   a  
Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
is represented by a vector of `hidden_size` values.
configreturnNc                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l
        [        R                  " UR                  5      U l        g N)r(   r)   r   Linearpatch_embed_hidden_sizer0   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr/   rR   r2   s     r3   r)   #Pix2StructVisionEmbeddings.__init__m   s}     "		&*H*H&J\J\ ]LL9K9KL!||FNNF<N<NOzz&"5"56r5   flattened_patchesc                     US S 2S S 2S4   R                  5       nUS S 2S S 2S4   R                  5       nUS S 2S S 2SS 24   nU R                  U5      nU R                  U5      nU R                  U5      nXE-   U-   nU R	                  U5      nU$ )Nr   r   r7   )longrX   r[   r\   r_   )r/   rb   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingss          r3   rD   "Pix2StructVisionEmbeddings.forwardv   s     (1a0557'1a0557-aABh7**+<=
**;7--k:  0>A
\\*-
r5   )r\   r_   rX   r[   )rG   rH   rI   rJ   __doc__r   r)   r+   TensorrD   rK   rL   rM   s   @r3   rP   rP   f   s<    7/ 7D 7 %,,  r5   rP   c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )Pix2StructVisionAttention   c                 l  > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        U R                  U R                  -  U l	        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        SU l        g NFbias)r(   r)   r0   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutr_   	inner_dimr   rV   querykeyvalueoutputgradient_checkpointingr`   s     r3   r)   "Pix2StructVisionAttention.__init__   s    !--"(++11//(?(?? YYt//eL
99T--t~~EJYYt//eL
ii0@0@uM&+#r5   c                 N  ^ ^ UR                   SS u  mnUU 4S jnU" T R                  U5      5      nU" T R                  U5      5      n	U" T R                  U5      5      n
[        R
                  " XR                  SS5      5      nUGc  [        R                  " ST R                  Xf4UR                  UR                  S9nT R                  (       a  T R                  (       a  SUl        UR                  5       S:X  a)  X2SS2SSSS24   R                  UR                  5      -   nOyUb  X2R                  UR                  5      -   nOX[!        5       (       dI  [        R"                  " TU4UR                  UR                  S9nX2R                  UR                  5      -   nSU-
  nUR%                  US:H  [        R&                  " UR                  5      R(                  5      nX-  n[        R*                  " U[        R,                  " [        R&                  " UR                  5      R(                  5      5      n[.        R0                  R3                  US[        R4                  S	9R7                  U5      n[.        R0                  R9                  UT R8                  T R                  S
9nUb  X-  n[        R
                  " X5      nUR                  SS5      R;                  5       R=                  TST R>                  5      nT RA                  U5      nU4U4-   nU(       a  X4-   nU$ )z
Self-attention block
Nr7   c                    > U R                  5       R                  TSTR                  TR                  5      R	                  SS5      $ )
projectionr8   r   r7   )
contiguousviewrw   ru   	transpose)states
batch_sizer/   s    r3   to_projection_shape>Pix2StructVisionAttention.forward.<locals>.to_projection_shape   s<    $$&++JDLL$JaJabllmnpqrrr5   r	   r   devicer?   Tr8   )dimr?   ptraining)!shaperz   r{   r|   r+   matmulr   zerosrw   r   r?   r~   r   requires_gradr   r:   r   r,   masked_fillfinfominmaxtensorr   
functionalsoftmaxr;   type_asr_   r   r   ry   r}   )r/   rB   attention_maskposition_biaslayer_head_maskoutput_attentions
seq_lengthr   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsr   s   `               @r3   rD   !Pix2StructVisionAttention.forward   s    "/!4!4Ra!8
J	s +4::m+DE )-)@A
*4::m+DE l,@,@A,FG !KKDLL*9&--W]WcWcM **t}}.2+!!#q( -q$a?O0P0S0STaThTh0i i+ -0A0A-BVBV0W W-//!&,]5I5IQ^QdQd" !.0A0A-BVBV0W W-M,88!9KU[[Y_YeYeMfMjMjk&65<<FLL0I0M0M#NO }},,V5==,QYYZ`a }},,\T\\TXTaTa,b &'9Lll<> "++Aq1<<>CCJPRTXTbTbckk+..M#33/Gr5   )
r_   r~   r0   ry   r{   ru   rw   r}   rz   r|   )NNNFrF   rM   s   @r3   rn   rn      s"    ,& M Mr5   rn   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Pix2StructVisionMlp   rR   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g rq   r(   r)   r   rV   r0   d_ffwi_0wi_1wor]   r^   r_   r
   dense_act_fnactr`   s     r3   r)   Pix2StructVisionMlp.__init__       IIf00&++EJ	IIf00&++EJ	))FKK););%Hzz&"5"56&--.r5   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ rU   r   r   r   r_   
isinstancer   r-   r+   rl   r?   int8r:   r/   rB   hidden_geluhidden_linears       r3   rD   Pix2StructVisionMlp.forward       hhtyy78		-0#3]3 tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r5   r   r_   r   r   r   )	rG   rH   rI   rJ   r!   r)   rD   rK   rL   rM   s   @r3   r   r      s    /5 / r5   r   c                      ^  \ rS rSrS\SS4U 4S jjr   SS\R                  S\\R                     S\\R                     S	\	S\
\\R                  \R                  4   \\R                     4   4
S
 jjrSrU =r$ )Pix2StructVisionLayeri	  rR   rS   Nc                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr   r1   )r(   r)   chunk_size_feed_forwardseq_len_dimrn   	attentionr   mlpr%   r0   layer_norm_epspre_mlp_layer_normpre_attention_layer_normr`   s     r3   r)   Pix2StructVisionLayer.__init__
  ss    '-'E'E$26:&v."5f6H6HfNcNc"d(;F<N<NTZTiTi(j%r5   rB   r   	head_maskr   c                     UnU R                  U5      nU R                  UUUUS9nUS   nUSS  nXu-   nU R                  U5      n	U R                  U	5      U-   n	U	4U-   nU$ )N)r   r   r   r   r   )r   r   r   r   )
r/   rB   r   r   r   residualself_attention_outputsattention_outputr   layer_outputs
             r3   rD   Pix2StructVisionLayer.forward  s     ! 55mD!%)%/	 "0 "
 2!4(, )3 ..}=xx-=/G+r5   )r   r   r   r   r   r   )NNF)rG   rH   rI   rJ   r   r)   r+   rl   r   boolr   r   rD   rK   rL   rM   s   @r3   r   r   	  s    k/ kD k 26,0"'|| !. ELL)	
   
uU\\5<</0%2EE	F r5   r   c                      ^  \ rS rSrS\SS4U 4S jjr     SS\R                  S\\R                     S\\R                     S	\	S
\	S\	S\
\\4   4S jjrSrU =r$ )Pix2StructVisionEncoderi4  rR   rS   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r(   r)   rR   r   
ModuleListrangenum_hidden_layersr   layerr~   )r/   rR   _r2   s      r3   r)    Pix2StructVisionEncoder.__init__5  sT    ]]5QWQiQiKj#kKja$9&$AKj#kl
&+# $ls   A&rB   r   r   r   output_hidden_statesreturn_dictc                    U(       a  SOS nU(       a  SOS n[        U R                  5       H{  u  pU(       a  Xq4-   nUb  X9   OS nU R                  (       a1  U R                  (       a   U R	                  U
R
                  UUUU5      nO	U
" XX5      nUS   nU(       d  Ms  XS   4-   nM}     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )N r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frU   r   .0vs     r3   	<genexpr>2Pix2StructVisionEncoder.forward.<locals>.<genexpr>a  s     m$[q$[s   	last_hidden_staterB   
attentions)	enumerater   r~   r   _gradient_checkpointing_func__call__tupler   )r/   rB   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_moduler   layer_outputss                r3   rD   Pix2StructVisionEncoder.forward;  s     #7BD$5b4(4OA#$58H$H!.7.CilO**t}} $ A A ))!"#%! !-]O o)!,M  &91=M<O&O#)  5,   14D Dm]GZ$[mmm++*
 	
r5   )rR   r~   r   )NNFFT)rG   rH   rI   rJ   r   r)   r+   rl   r   r   r   r   r   rD   rK   rL   rM   s   @r3   r   r   4  s    ,/ ,D , 26,0"'%* +
||+
 !.+
 ELL)	+

  +
 #+
 +
 
uo%	&+
 +
r5   r   c                   <    \ rS rSr\rSrSr\S 5       r	S r
S rSrg)	Pix2StructPreTrainedModelii  TFc                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r+   r   r   r   )r/   r   
input_maskdummy_inputss       r3   r   &Pix2StructPreTrainedModel.dummy_inputso  s6    LL.	\\*-
!*"&0

 r5   c                    U R                   R                  n[        U[        5      (       a)  UR                  R
                  R                  US-  5        g[        U[        5      (       Gaf  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  nUR                  R                  R
                  R                  SX#S-  -  S9  [        UR                  S5      (       aE  UR                  R                  b.  UR                  R                  R
                  R!                  5         UR"                  R                  R
                  R                  SX#S-  -  S9  [        UR"                  S5      (       aE  UR"                  R                  b.  UR"                  R                  R
                  R!                  5         UR$                  R                  R
                  R                  SX$S-  -  S9  [        UR$                  S5      (       aG  UR$                  R                  b/  UR$                  R                  R
                  R!                  5         ggg[        U[&        5      (       Ga  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R(                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R*                  OU R                   R*                  nUR,                  R                  R
                  R                  SX#U-  S-  -  S9  UR.                  R                  R
                  R                  SX#S-  -  S9  UR0                  R                  R
                  R                  SX#S-  -  S9  UR2                  R                  R
                  R                  SX&U-  S-  -  S9  UR4                  (       a4  UR6                  R                  R
                  R                  SX#S-  -  S9  gg[        U[8        R:                  5      (       a  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  nUR                  R
                  R                  SX#S-  -  S9  UR<                  b2  UR                  R
                  UR<                     R!                  5         gg[        U[>        5      (       a  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  nUR@                  R                  R
                  R                  SX#S-  -  S9  g[        U[8        RB                  [8        RD                  45      (       a  [8        RF                  RI                  UR                  R
                  RK                  [L        RN                  5      SU R                   RP                  S9RK                  UR                  RR                  5      UR                  l        UR                  b%  UR                  R
                  R!                  5         gg[        U[        5      (       a4  UR                  b&  UR                  R
                  R                  S5        gg[        U[8        R:                  5      (       ax  UR                  R
                  R                  SU R                   RP                  S9  UR<                  b2  UR                  R
                  UR<                     R!                  5         ggg)zInitialize the weights      ?        g      )r=   stdrs   N)*rR   initializer_factorr   r%   r-   datafill_ Pix2StructTextDenseGatedActDenser   text_configr0   r   r   normal_hasattrrs   zero_r   r   Pix2StructTextAttentionrt   	num_headsrz   r{   r|   r}   has_relative_attention_biasrelative_attention_biasr   rY   padding_idxPix2StructTextModellm_headrV   Conv2dinittrunc_normal_r:   r+   r;   initializer_ranger?   )r/   modulefactorr0   r   ru   rw   s          r3   _init_weights'Pix2StructPreTrainedModel._init_weightsz  s   //f122MM$$Vc\2 @AA dkk+;<< ''33[[,, 
 4>dkkK[3\3\4;;**//bfbmbmbrbrDKK##++&UYDY:Z+[v{{F++0@0@0L  %%++-KK##++&UYDY:Z+[v{{F++0@0@0L  %%++-II!!))sD.8Q)Rvyy&))fiinn.H		##))+ /I) 788
 dkk+;<< ''33[[,,  1;4;;HX0Y0Y'',,_c_j_j_v_v 
 dkk+;<< ''11[[**  LL$$,,#6TfFfkoEo;p,qJJ""**PTCT9U*VLL$$,,#6RVEV;W,XMM  %%--3FQcGchlFl<m-n11..55::BBQWlp[pQqBr 2-- dkk+;<< ''33[[,,  MM&&CVPT?T5U&V!!-""6#5#56<<> . 344 dkk+;<< ''33[[,,  NN!!&&..CVX\G\=].^BII 677 "$!6!6""%%emm43DKKDaDa "7 "b$$% MM {{&  &&( ' 344}}(""((- )--MM&&CT[[5R5R&S!!-""6#5#56<<> . .r5   c                    U R                   R                  nU R                   R                  nUc  [        S5      e[	        U5      (       aE  [
        R                  " UR                  S S S-   U5      n[
        R                  " XASS S24   /SS9nO=UR                  UR                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US	:H  U5        U$ )
Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information.r8   )r   .r   r   ).r   z1self.model.config.pad_token_id has to be defined.)rR   decoder_start_token_idpad_token_id
ValueErrorr   r+   fullr   cat	new_zerosclonemasked_fill_)r/   r   r  r  shifted_input_idss        r3   _shift_right&Pix2StructPreTrainedModel._shift_right  s    !%!C!C{{//!)<  Y'' %

9??3B+?$+FH^ _ %		+<SbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r5   r   N)rG   rH   rI   rJ   r   config_class_supports_cache_class_supports_static_cachepropertyr   r  r%  rK   r   r5   r3   r   r   i  s1    #L " M?`!r5   r   c                     ^  \ rS rSr\rSrSrS/rS\	4U 4S jjr
S rS\\\\   4   S	S
4S jr\      SS\\R&                     S\\R&                     S\\R&                     S\\   S\\   S\\   S	\\\4   4S jj5       rSrU =r$ )Pix2StructVisionModeli  rb   Tr   rR   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  S9U l
        U R                  5         g Nr   )r(   r)   rR   rP   rg   r   encoderr%   r0   r   	layernorm	post_initr`   s     r3   r)   Pix2StructVisionModel.__init__  sS     4V<.v6,V-?-?VEZEZ[ 	r5   c                 .    U R                   R                  $ rU   )rg   rX   r/   s    r3   get_input_embeddings*Pix2StructVisionModel.get_input_embeddings  s    ///r5   heads_to_prunerS   Nc                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr/  r   r   prune_heads)r/   r7  r   headss       r3   _prune_heads"Pix2StructVisionModel._prune_heads  s<    
 +002LELLu%//;;EB 3r5   r   r   r   r   r   c           	      &   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eUc   UR                  SS9S:g  R                  5       nU R                  X0R                   R                  5      nU R                  U5      nU R                  UUUUUUS9nUS   n	U R                  U	5      n	U(       d
  U	4n
XSS -   $ [        U	UR                  UR                  S9$ )	a  
flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
    Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
    [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
    paper](https://arxiv.org/abs/2210.03347) (figure 5) for more details.

Example:

```python
>>> import requests
>>> from PIL import Image
>>> from transformers import AutoProcessor, Pix2StructVisionModel

>>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 2048, 768]
```
Nz%You have to specify flattened_patchesr8   r  r   )r   r   r   r   r   r   r   )rR   r   r   use_return_dictr  sumfloatget_head_maskr   rg   r/  r0  r   rB   r   )r/   rb   r   r   r   r   r   embedding_outputencoder_outputssequence_outputhead_outputss              r3   rD   Pix2StructVisionModel.forward  s4   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$DEE!/333;q@GGIN &&y++2O2OP	??+<=,,)/!5# ' 
 *!,..9+-L!""555-)77&11
 	
r5   )rR   rg   r/  r0  )NNNNNN)rG   rH   rI   rJ   r!   r'  main_input_namesupports_gradient_checkpointing_no_split_modulesr   r)   r5  r   intr   r<  r   r   r+   rl   r   r   r   r   rD   rK   rL   rM   s   @r3   r,  r,    s    )L)O&*#01
/ 
0C4T#Y+? CD C  5915,0,0/3&*N
#ELL1N
 !.N
 ELL)	N

 $D>N
 'tnN
 d^N
 
u00	1N
 N
r5   r,  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )r  iW  rR   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g rq   r   r`   s     r3   r)   )Pix2StructTextDenseGatedActDense.__init__X  r   r5   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ rU   r   r   s       r3   rD   (Pix2StructTextDenseGatedActDense.forward`  r   r5   r   	rG   rH   rI   rJ   r    r)   rD   rK   rL   rM   s   @r3   r  r  W  s    /3 / r5   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Pix2StructTextLayerFFit  rR   c                    > [         TU ]  5         [        U5      U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r.  )r(   r)   r  DenseReluDenser%   r0   layer_norm_epsilon
layer_normr   r]   r^   r_   r`   s     r3   r)   Pix2StructTextLayerFF.__init__u  sK    >vF-f.@.@fF_F_`zz&"5"56r5   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ rU   )rW  rU  r_   )r/   rB   forwarded_statess      r3   rD   Pix2StructTextLayerFF.forward}  s;    ??=9../?@%5E(FFr5   )rU  r_   rW  rQ  rM   s   @r3   rS  rS  t  s    73 7 r5   rS  c                   z   ^  \ rS rSr S	S\S\\   4U 4S jjjr\S
S j5       r	SS jr
         SS jrSrU =r$ )r
  i  rR   	layer_idxc                   > [         TU ]  5         X l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l	        UR                  U l        U R                  U R                  -  U l        X0l        Uc-  [        R                  SU R                   R"                   S35        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        U R                  (       a0  [$        R0                  " U R                  U R                  5      U l        [5        5       U l        SU l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Frr   )r(   r)   r  relative_attention_num_bucketsrelative_attention_max_distancer0   rt   ru   r  rw   r^   r_   ry   r]  loggerwarning_oncer2   rG   r   rV   rz   r{   r|   r}   rY   r  setpruned_headsr~   r/   rR   r  r]  r2   s       r3   r)    Pix2StructTextAttention.__init__  so    	+F(.4.S.S+/5/U/U,!--"(++''**(?(??"*4>>+B+B*C D, , YYt//1A1AN
99T--t/?/?eLYYt//1A1AN
ii 0 0$2B2BO+++-<<8[8[]a]i]i+jD(E&+#r5   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ )aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   r7   r   )r:   r+   rd   absr   
zeros_likelogrA  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r3   _relative_position_bucket1Pix2StructTextAttention._relative_position_bucket  s   . AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r5   c                    Uc   U R                   R                  R                  nUc,  [        R                  " U[        R
                  US9SS2S4   nOUSS2S4   R                  U5      n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  USU R                  U R                  S9nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )z%Compute binned relative position biasN)r?   r   F)ro  rp  rq  )r7   r   r   r   )r  r-   r   r+   arangerd   r:   rv  r_  r`  permute	unsqueeze)
r/   query_length
key_lengthr   cache_positioncontext_positionmemory_positionrn  relative_position_bucketvaluess
             r3   compute_bias$Pix2StructTextAttention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+>#'#A#A;;==	 $B $
  --.FG	*44Q7r5   c                    UR                   SS u  pUSLnU R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUbE  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nU(       a  UOUnU(       aA  U(       a:  W(       a3  WR                  U R                     nUR                  U R                     nOU R                  U5      nU R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUR                  USU R                  U R                  5      R                  SS5      nUbN  U(       d  U
OSn
WR                  UUU R                  SU
05      u  nnU(       a  SUR                  U R                  '   [         R"                  " UUR                  SS5      5      nUc  UR                   S   nUb  UOU
S   S-   nU R$                  (       db  [         R&                  " SU R                  UU4UR(                  UR*                  S	9nU R,                  (       a  U R.                  (       a  SUl        O.U R3                  UUUR(                  U
S
9nUSS2SS2U* S2SS24   nUb#  USS2SS2SS2SUR                   S   24   nUU-   nU R4                  (       aS  [         R6                  " UR                   S   5      nSU[9        U R4                  5      '   USS2UR;                  5       4   nOUnUU-  n[<        R>                  RA                  URC                  5       SS9RE                  U5      n[<        R>                  RG                  UU RF                  U R.                  S9nUb  UU-  n[         R"                  " UU5      nUR                  SS5      RI                  5       nUR                  USU RJ                  5      nU RM                  U5      nUXT4nU	(       a  UU4-   nU$ )zp
Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
Nr7   r8   r   r~  Tr	   r   )r   r~  r   r  r   )'r   rz   r   rw   ru   r   
is_updatedgetr]  cross_attention_cacheself_attention_cache	key_cachevalue_cacher{   r|   updater+   r   r  r   r   r?   r~   r   r   r  rd  r,   listr   r   r   r   rA  r   r_   r   ry   r}   )r/   rB   maskkey_value_statesr   past_key_valuer   r|  	use_cacher   r~  r   r   is_cross_attentionr   r  curr_past_key_valuecurrent_statesr   r   r   r}  real_seq_lengthcausal_maskr   r   r   r   s                               r3   rD   Pix2StructTextAttention.forward  s   $ "/!4!4Ra!8
 .T9zz-0#((RtG^G^_iijkmno%'2266t~~FJ!&4&J&J#&4&I&I#-?)].Z,66t~~FJ.::4>>JL.1J::n5L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn= lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&z2t~~Fkk+.>/Gr5   )r_   r~   r  r0   ry   r{   ru   r]  rw   r}   rd  rz   r  r`  r_  r|   FN)T       )NN)	NNNNNNFFN)rG   rH   rI   rJ   r    r   rK  r)   staticmethodrv  r  rD   rK   rL   rM   s   @r3   r
  r
    si    jn,*,ZbcfZg, ,> -  - `0 i ir5   r
  c                   R   ^  \ rS rSrSS\\   4U 4S jjjr       SS jrSrU =r	$ ) Pix2StructTextLayerSelfAttentioniX  r]  c                    > [         TU ]  5         [        XUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr  r]  r   r(   r)   r
  r   r%   r0   rV  rW  r   r]   r^   r_   re  s       r3   r)   )Pix2StructTextLayerSelfAttention.__init__Y  sR    0W`
 .f.@.@fF_F_`zz&"5"56r5   c	                     U R                  U5      n	U R                  U	UUUUUUUS9n
XR                  U
S   5      -   nU4U
SS  -   nU$ )N)r  r   r   r  r  r   r~  r   r   rW  r   r_   )r/   rB   r   r   r   r  r  r   r~  normed_hidden_statesr   r   s               r3   rD   (Pix2StructTextLayerSelfAttention.forwarda  sr      $}=>> '+)/) * 	
 &5Ea5H(II "%5ab%99r5   r   r_   rW  r  )NNNNFFN
rG   rH   rI   rJ   r   rK  r)   rD   rK   rL   rM   s   @r3   r  r  X  s:    7XVY] 7 7  r5   r  c                   T   ^  \ rS rSrSS\\   4U 4S jjjr        SS jrSrU =r	$ )!Pix2StructTextLayerCrossAttentioni}  r]  c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr  r   r  )r/   rR   r]  r2   s      r3   r)   *Pix2StructTextLayerCrossAttention.__init__~  sP    0UZfop-f.@.@fF_F_`zz&"5"56r5   c                     U R                  U5      nU R                  UUUUUUUUU	U
S9
nXR                  US   5      -   nU4USS  -   nU$ )N)	r  r  r   r   r  r  r|  r   r~  r   r   r  )r/   rB   r  r   r   r   r  r  r|  r   r~  r  r   r   r   s                  r3   rD   )Pix2StructTextLayerCrossAttention.forward  sw      $}=>> -'+)%/) * 
 %||4DQ4G'HH/$4QR$88r5   r  rU   )NNNNFNFNr  rM   s   @r3   r  r  }  s<    7(3- 7 7  r5   r  c                   \   ^  \ rS rSrSS\\   4U 4S jjjr            SS jrSrU =r	$ )Pix2StructTextBlocki  r]  c                    > [         TU ]  5         [        UUUS9U l        [	        UUS9U l        [        U5      U l        g )Nr  )r]  )r(   r)   r  self_attentionr  encoder_decoder_attentionrS  r   re  s       r3   r)   Pix2StructTextBlock.__init__  sH    >(C
 *K*
&
 )0r5   c                 P   U R                  UUUUU	U
UUS9nUS S u  pUSS  nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nUS LnU(       a  U R                  UUUUUU	US   S-   U
US9	nUS S u  pUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nUUSS  -   nU R                  U5      nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nU4nU
(       a  UU	4-   U-   nU$ UU-   nU$ )N)r   r   r   r  r  r   r~  r7   i  )r   r   r8   r   )r  r   r   r   r  r|  r  r   )r  r?   r+   r@   isinfanyr   r   clampr  r   )r/   rB   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskr  r  r   r   r~  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                       r3   rD   Pix2StructTextBlock.forward  s#     "&!4!4)'+)/) "5 	"
 )?r(B%2126 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM2$>&*&D&D!65; :-+B/!3#"3 'E 
'# -DBQ,G)M ""emm3M8R8V8V8X8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O / %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM " 114EEG   11Gr5   )r  r   r  r  )NNNNNNNNFFTNr  rM   s   @r3   r  r    sK    1XVY] 1 1& "#&*#'H Hr5   r  z3
    The standalone text decoder of Pix2Struct
    )custom_introc            #         ^  \ rS rSr\rS/rS/rSrU 4S jr	S r
S rS rS	 rS
 r\              S%S\\R$                     S\\R&                     S\\R&                     S\\R&                     S\\R$                     S\\R&                     S\\R(                     S\\\\R&                           S\\   S\\   S\\   S\\R$                     S\\   S\\R$                     S\\\R&                  S4   \4   4S jj5       r S&S\\R(                  S4   S\R(                  S\R(                  S\S\4
S jjr\S\R(                  S\S \S!\R<                  S\R(                  S"\4S# j5       rS$r U =r!$ )'r  i  r  zlm_head.weightTc                 R  > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U[        US:H  5      US9PM     sn5      U l        [        UR
                  UR                  S9U l        [        R                   " UR"                  5      U l        [        R&                  " UR
                  UR                  SS9U l        U R+                  5         SU l        g s  snf )Nr   r  r   Frr   )r(   r)   r   rY   
vocab_sizer0   embed_tokensr   r   
num_layersr  r   r   r%   rV  final_layer_normr]   r^   r_   rV   r  r1  r~   )r/   rR   r   r2   s      r3   r)   Pix2StructTextModel.__init__
  s     LL):):F<N<NO]] v0011A $FQRSV`ab1

 !4F4F4FFLeLe fzz&"5"56yy!3!3V5F5FUS 	&+#s   (!D$c           	         Uc  [         R                  S5        U$ SnU H  nSnU H2  nUUR                  SUR                  UR                  5      5      4-   nM4     US   R
                  US   R
                  :w  a,  [        SUS   R
                   SUS   R
                   S35      e[        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      eX54-   nM     U$ )	NzHYou might want to consider setting `use_cache=True` to speed up decodingr   r   z%reordered_layer_past_states[0] shape z  and layer_past_states[0] shape z mismatchedz&length of reordered_layer_past_states z! and length of layer_past_states )ra  warningindex_selectr:   r   r   r  len)r/   past_key_valuesbeam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_states          r3   _reorder_cache"Pix2StructTextModel._reorder_cache  sf    "NNef""!#!0 +-'$5 .I$11!X[[AQAXAX5YZM /+ %6 +1-337H7K7Q7QQ ;<WXY<Z<`<`;a  bB  CT  UV  CW  C]  C]  B^  ^i  j  ./37H3II <SA\=]<^^  AD  EV  AW  @X  Xc  d  &<>\%\"' "1( &%r5   c                     U R                   $ rU   r  r4  s    r3   r5  (Pix2StructTextModel.get_input_embeddings<  s       r5   c                     Xl         g rU   r  r/   new_embeddingss     r3   set_input_embeddings(Pix2StructTextModel.set_input_embeddings?  s    *r5   c                     U R                   $ rU   r  r4  s    r3   get_output_embeddings)Pix2StructTextModel.get_output_embeddingsB      ||r5   c                     Xl         g rU   r  r  s     r3   set_output_embeddings)Pix2StructTextModel.set_output_embeddingsE  s    %r5   r   r   r  r  inputs_embedsr   cross_attn_head_maskr  r  r   r   labelsr   r~  rS   .c                 n   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb&  UR                  5       nUR                  SUS   5      nO"Ub  UR                  5       SS nO[        S5      eUc%  U R                  c   S5       eU R                  U5      nUu  nnSnSnU	(       d  Ub  [        U[        5      (       a,  [        U[        5      (       d  Sn[        U[        5       5      nOb[        U[        5      (       d.  Sn[        R                  S5        [        R                  " U5      nOUc  [        [        5       [        5       5      nS	nUb  US	   nOUb  UR!                  5       nUc#  ["        R$                  " UUU-   UR&                  S
9nUc8  Ub  UR!                  5       U-   OUn["        R(                  " UUUR&                  S
9nU R                   R*                  (       a%  U R-                  UUUUb  UR.                  OSU
5      nOVUSS2SSSS24   nUR1                  UR2                  S9nSU-
  ["        R4                  " UR2                  5      R6                  -  nUbL  UR                  5       u  nnnUU4nUc  ["        R(                  " UUR&                  S
9nU R9                  U5      nOSnU R;                  X`R                   R<                  5      nU R;                  XpR                   R<                  5      nU(       a  SOSnU
(       a  SOSnU
(       a  SOSnSn Sn!U R?                  U5      n"[A        U RB                  5       H  u  n#n$UU#   n%UU#   n&U(       a  UU"4-   nU RD                  (       aW  U RF                  (       aF  U	(       a  [        RI                  S5        Sn	U RK                  U$RL                  U"UU UUU!U%U&SU	U
U5      n'OU$" U"UU UUU!U%U&UU	U
US9n'U	SL a  U'SS S-   U'SS -   n'U'SS u  n"n(U'S   n Ub  U'U
(       a  SOS   n!U
(       d  M  UU'S   4-   nUc  M  UU'S   4-   nM     U RO                  U"5      n"U R?                  U"5      n"U RQ                  U"5      n)U(       a  UU"4-   nSn*Ub  UR1                  U)R&                  5      n[R        RT                  " SSS9n+U+" U)RW                  5       R                  SU)R                  S5      5      URW                  5       R                  S5      5      n*U	(       a  W(OSn,U(       a  UR.                  n,U(       a  URY                  5       n,U(       d  [[        S U*U)U,UUU4 5       5      $ []        U*U)U,UUUS9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
    embeddings so you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
    Training](./t5#training).
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoProcessor, Pix2StructTextModel

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

>>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> loss = outputs.loss
```
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer8   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddingsFTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r   )r?   r   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...)r   r   r  r  r  r   r  r  r  r   r~  r   rU   r7      r	      r  r=   )ignore_index	reductionc              3   0   #    U  H  nUc  M  Uv   M     g 7frU   r   r   s     r3   r   .Pix2StructTextModel.forward.<locals>.<genexpr>/  s"      A  s   	)losslogitsr  rB   r   cross_attentions)/rR   r  r   r   r?  r  sizer   r  r   r   r   r   ra  rb  from_legacy_cacheget_seq_lengthr+   ry  r   r,   
is_decoder_update_causal_maskr  r:   r?   r   r   invert_attention_maskrB  r  r_   r   r   r~   r   r  r   rD   r  r  r   CrossEntropyLossr   to_legacy_cacher   r   )-r/   r   r   r  r  r  r   r  r  r  r   r   r  r   r~  kwargsinput_shaper   r   return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthr  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   all_attentionsall_cross_attentionsr   r  rB   r   r   r   r  r   next_decoder_cacher  r  loss_fct
next_caches-                                                r3   rD   Pix2StructTextModel.forwardH  s   f "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>stt"#..*K!r;r?;I&',,.s3Kdee $$0p2pp0 --i8M!,
J $&+#3/511*_Vi:j:j.2+"5o|~"V1DEE&*###`
 #6"G"G"X ("5lnln"U!"%%3A%6"(%4%C%C%E"!"\\&(>(KTaThThN ! BQA\..0:=bl  #ZZ
OML`L`aN;;!!228G8S44Y]!K )D$)9:K%..}/B/B.CK,M<O<O0P0T0TTK !,=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y++2H2HI	#112FH^H^_"6BD0d&7rd(,%]3(4OA|'lO)=a)@&#$58H$H!**t}}NNt !&I $ A A ((!!)31#.%"!  !-!#."/*?+J2O$3/I#2'&7#1!" E! -bq 1G ;mAB>O O0=bq0A-M-
 *!,M$00=CTaZ[0\-  !/=3C2E!E(4+?=QRCSBU+U(y  5| --m<]3m,   1]4D DYYv}}-F**OHF--/44RRI6K\K\K^KcKcdfKghD+4'$
&(==J(88:J  %"(   1&+%1
 	
r5   r"   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r   flex_attentionr   Fsdpa)r  r   is_trainingr   r8   )sequence_lengthtarget_lengthr?   r~  r   )cudaxpunpu)rR   _attn_implementationr  r   r+   rl   r#   r  is_compileabler   _ignore_causal_mask_sdpar   r?   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r   _unmask_unattended)r/   r   r  r~  r  r   past_seen_tokensusing_compilable_cacher?   r  r  r  	min_dtypes                r3   r  'Pix2StructTextModel._update_causal_maskE  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr5   r  r  r?   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nr  )
fill_valuer?   r   r   )diagonalr  r8   r   )r   r+   r   r   r  r   triury  reshapeexpandr"  r   r:   r   )r   r  r  r?   r~  r   r  r  r   mask_lengthpadding_masks              r3   r  IPix2StructTextModel._prepare_4d_causal_attention_mask_with_cache_position  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r5   )r_   r  r  r~   r   r  )NNNNNNNNNNNNNN)F)"rG   rH   rI   rJ   r    r'  rJ  _tied_weights_keysrI  r)   r  r5  r  r  r  r   r   r+   
LongTensorFloatTensorrl   r   r   r   r   rD   r   r  r  rK  r?   r  rK   rL   rM   s   @r3   r  r    sv    (L./*+&*#,(&<!+&  156:=A>B48157;EI$(,0/3-1&*59y
E,,-y
 !!2!23y
  ((9(9:	y

 !)):): ;y
   0 01y
 E--.y
 'u||4y
 "%e.?.?(@"ABy
 D>y
 $D>y
 'tny
 ))*y
 d^y
 !!1!12y
" 
uU&&+,.OO	P#y
 y
D #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r5   r  zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c            &         ^  \ rS rSr\rSrS/rS\4U 4S jjrS r	S r
S\R                  4S	 jrS
 rS rS r\                SS\\R(                     S\\R(                     S\\R*                     S\\R,                     S\\R(                     S\\R(                     S\\R.                     S\\\\R(                           S\\\\R(                           S\\R*                     S\\R.                     S\\   S\\   S\\   S\\   S\\R*                     S\\\R(                     \4   4"S jj5       rSrU =r$ )"Pix2StructForConditionalGenerationi  rb   zdecoder.lm_head.weightrR   c                    > [         TU ]  U5        [        UR                  5      U l        [        UR                  5      U l        UR                  U l        U R                  5         g rU   )
r(   r)   r,  vision_configr/  r  r  decoderis_vqar1  r`   s     r3   r)   +Pix2StructForConditionalGeneration.__init__  sK     ,V-A-AB*6+=+=>mm 	r5   c                 6    U R                   R                  5       $ rU   )r2  r5  r4  s    r3   r5  7Pix2StructForConditionalGeneration.get_input_embeddings  s    ||0022r5   c                 :    U R                   R                  U5        g rU   )r2  r  r  s     r3   r  7Pix2StructForConditionalGeneration.set_input_embeddings  s    )).9r5   rS   c                 6    U R                   R                  5       $ rU   )r2  r  r4  s    r3   r  8Pix2StructForConditionalGeneration.get_output_embeddings  s    ||1133r5   c                 :    U R                   R                  U5        g rU   )r2  r  r  s     r3   r  8Pix2StructForConditionalGeneration.set_output_embeddings  s    **>:r5   c                     U R                   $ rU   )r2  r4  s    r3   get_decoder.Pix2StructForConditionalGeneration.get_decoder  r  r5   c                     U R                   $ rU   )r/  r4  s    r3   get_encoder.Pix2StructForConditionalGeneration.get_encoder  r  r5   r   r   r   r   decoder_head_maskr  rD  r  r  decoder_inputs_embedsr  r   r   r   r~  c                 <   Ub  UOU R                   R                  R                  nUb  UOU R                   R                  nUc  U R	                  UUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU
bX  UcU  UcR  U R                  U
5      nUb  UO2UR                  U R                   R                  5      R                  5       nSUSS2S4'   U R                  UUUU	UUUUUUUU
UUS9nU(       d  UU-   $ [        UR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  UR"                  UR$                  S9	$ )	a}  
flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
    Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
    `num_channels` * `patch_size` * `patch_size`

    The process of flattening the pixel patches is done by `Pix2StructProcessor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss for the decoder.

Example:

Inference:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> # autoregressive generation
>>> generated_ids = model.generate(**inputs, max_new_tokens=50)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
A stop sign is on a street corner.

>>> # conditional generation
>>> text = "A picture of"
>>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

>>> generated_ids = model.generate(**inputs, max_new_tokens=50)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
A picture of a stop sign with a red stop sign
```

Training:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
>>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "A stop sign is on the street corner."

>>> inputs = processor(images=image, return_tensors="pt")
>>> labels = processor(text=text, return_tensors="pt").input_ids

>>> # forward pass
>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss
>>> print(f"{loss.item():.5f}")
5.94282
```N)rb   r   r   r   r   r   r   r   r7   r   )r   r   r  r  r  r  r   r  r  r   r   r  r   r~  )	r  r  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)rR   r  r  r?  r/  r   r   r  r%  ner  rA  r2  r   r  r  r  rB   r   r  r   )r/   rb   r   r   r   r   rC  r  rD  r  r  rD  r  r   r   r   r~  rB   decoder_outputss                      r3   rD   *Pix2StructForConditionalGeneration.forward  s   d "+!6IDKK<S<S<]<]	%0%<k$++B]B] ""ll"3-#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 *5 '&))$++*B*BCIIK # ,-"1a4( ,,'1/+"/#1'!5/!5#) ' 
" "_44 %%"))+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r5   )r2  r/  r3  )NNNNNNNNNNNNNNNN)rG   rH   rI   rJ   r   r'  rH  r+  r)   r5  r  r   Moduler  r  r>  rA  r   r   r+   r-  r,  
BoolTensorrl   r   r   r   r   rD   rK   rL   rM   s   @r3   r/  r/    s	    $L)O23	/ 	3:4ryy 4;  :>6:8<=A159=7;EIEI-18<$(,0/3&*59#q
#E$5$56q
 !!2!23q
 $E$4$45	q

 !))9)9 :q
 E--.q
 $E$5$56q
 'u||4q
 "%e.?.?(@"ABq
 "%e.?.?(@"ABq
 ))*q
  (5q
 D>q
 $D>q
 'tnq
  d^!q
" !!1!12#q
$ 
uU&&');;	<%q
 q
r5   r/  )r   r/  r,  r  )Krk   rk  typingr   r   r   r   r   r+   torch.utils.checkpointr   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   r   configuration_pix2structr   r    r!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerrG   ra  rM  r%   apex.normalizationrN   infoImportError	Exceptionr  appendrP   rn   r   r   r   r   r,  r  rS  r
  r  r  r  r  r/  __all__r   r5   r3   <module>rc     s;     5 5    ! C C ) >  . 1   e d  !!;J 
		H	%+")) +2	/&
KKij   / 0! !H^		 ^D")) :(BII (V2
bii 2
j y! y! y!x l
5 l
 l
`ryy :BII  Pbii Ph!ryy !J#		 #LY")) Yx 
{3 {
{| 
T
)BO T

T
nE2  	 	
NN_`	s   5H H8H87H8