
    fTh                     2   S r SSKrSSKrSSKJrJrJr  SSKrSSKJr  SSK	J
r
  SSKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJrJr  SSKJr  \R8                  " \5      r " S S\R>                  5      r  " S S\R>                  5      r! " S S\RD                  5      r# " S S\RD                  5      r$ " S S\RD                  5      r%\ " S S\5      5       r& " S S\&5      r'\" SS9 " S S \&5      5       r(\" S!S9 " S" S#\&\5      5       r)S#S/r*g)$z/PyTorch TrOCR decoder model (based on RoBERTa).    N)OptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging   )TrOCRConfigc                      ^  \ rS rSrSrS\S\4U 4S jjrSS\R                  S\S\R                  4U 4S	 jjjr	S
r
U =r$ )TrOCRLearnedPositionalEmbedding)   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g )N   )offsetsuper__init__)selfr   r   	__class__s      `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/trocr/modeling_trocr.pyr   (TrOCRLearnedPositionalEmbedding.__init__.   s"     ++5}E    	input_idspast_key_values_lengthposition_idsc                   > Uc]  UR                   SS u  pE[        R                  " X"U-   [        R                  U R                  R
                  S9R                  US5      nOUR                  S5      n[        TU ]%  X0R                  -   5      $ )z3`input_ids' shape is expected to be [bsz x seqlen].Nr   )dtypedevicer   )shapetorcharangelongweightr(   expand	unsqueezer   forwardr   )r   r#   r$   r%   bszseq_lenr   s         r    r1   'TrOCRLearnedPositionalEmbedding.forward4   s     $??2A.LC <<&(HPUPZPZcgcncncucufS"o  (11!4Lw|kk9::r"   )r   r   N)__name__
__module____qualname____firstlineno____doc__intr   r+   Tensorr1   __static_attributes____classcell__r   s   @r    r   r   )   sH    Fs F3 F; ;s ;^c^j^j ; ;r"   r   c            
       r   ^  \ rS rSrSrSS\S\S\S\\   4U 4S jjjrS\	R                  4U 4S	 jjrS
rU =r$ )TrOCRScaledWordEmbeddingC   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
r   r   padding_idxembed_scalec                 2   > [         TU ]  XU5        X@l        g N)r   r   rD   )r   r   r   rC   rD   r   s        r    r   !TrOCRScaledWordEmbedding.__init__H   s    D&r"   r#   c                 <   > [         TU ]  U5      U R                  -  $ rF   )r   r1   rD   )r   r#   r   s     r    r1    TrOCRScaledWordEmbedding.forwardL   s    wy)D,<,<<<r"   rD   )      ?)r6   r7   r8   r9   r:   r;   r   floatr   r+   r<   r1   r=   r>   r?   s   @r    rA   rA   C   sJ    's '3 'S '_ghm_n ' '= = =r"   rA   c            	          ^  \ rS rSrSrSS\S\S\\   4U 4S jjjr\SS\S\S\\   4S jj5       r	\
R                  " 5       SS	\
R                  S
\4S jj5       r SS	\
R                  S\S
\\   4S jjrSrU =r$ )"TrOCRSinusoidalPositionalEmbeddingP   zDThis module produces sinusoidal positional embeddings of any length.num_positionsr   rC   c                    > [         TU ]  5         SU l        X l        X0l        U R                  XU5      U l        U R                  S[        R                  " S5      5        g )Nr   _float_tensorr   )
r   r   r   r   rC   get_embeddingweightsregister_bufferr+   FloatTensor)r   rP   r   rC   r   s       r    r   +TrOCRSinusoidalPositionalEmbedding.__init__S   sP    *&))-T_e.?.?.BCr"   r   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
r   i'  r   )r'   r   dimr)   N)mathlogr+   expr,   int64rL   r0   catsincosviewzerostoget_default_dtype)r   r   rC   half_dimembs        r    rS   0TrOCRSinusoidalPositionalEmbedding.get_embedding[   s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r"   r#   r$   c                 J   UR                  5       u  p4U R                  XR                  U5      R                  UR                  5      nU R                  S-   U-   nU R
                  b  X`R
                  R                  S5      :  a+  U R                  X`R                  U R                  5      U l        U R
                  R                  U R                  5      U l        U R
                  R                  SUR                  S5      5      R                  X4S5      R                  5       nU$ )Nr   r   r)   )size"create_position_ids_from_input_idsrC   rd   r(   rT   rS   r   rR   index_selectrb   detach)r   r#   r$   r2   r3   r%   max_posxs           r    r1   *TrOCRSinusoidalPositionalEmbedding.forwardn   s     ~~'>>yJZJZ\rsvv

 ""Q&0<<7\\->->q-A#A--g7I7I4K[K[\DL||t'9'9:LL%%a):):2)>?DDSSUV]]_r"   c                     UR                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
r   rY   )ner;   r+   cumsumtype_asr-   )r   r#   rC   r$   maskincremental_indicess         r    rk   ETrOCRSinusoidalPositionalEmbedding.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77r"   )r   r   rC   rT   rF   )r   )r6   r7   r8   r9   r:   r;   r   r   staticmethodrS   r+   no_gradr<   r1   rk   r=   r>   r?   s   @r    rN   rN   P   s    NDc D# DHUXM D D 1c 1# 1HUXM 1 1$ ]]_ s  & bc
8
847
8QYZ]Q^
8 
8r"   rN   c                     ^  \ rS rSrSr      SS\S\S\\   S\\   S\S\S	\S
\4U 4S jjjr	S\
R                  S\S\4S jr     SS\
R                  S\\
R                     S\\\
R                        S\\
R                     S\\
R                     S\S\\
R                  \\
R                     \\\
R                        4   4S jjrSrU =r$ )TrOCRAttention   z>Multi-headed attention from 'Attention Is All You Need' paper.	embed_dim	num_headskdimvdimdropout
is_decoderbiasis_cross_attentionc
                 &  > [         T
U ]  5         X l        Ub  UOUU l        Ub  UOUU l        X0l        X`l        X#-  U l        U R                  U-  U R                  :X  d  [        SU R                   SU S35      eU R                  S-  U l	        Xpl
        [        R                  " U R                  X(S9U l        [        R                  " U R                  X(S9U l        [        R                  " X"US9U l        [        R                  " X"US9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩r   )r   r   r}   r   r   r~   r   head_dim
ValueErrorscalingr   r   Lineark_projv_projq_projout_proj)r   configr}   r~   r   r   r   r   r   r   r   s             r    r   TrOCRAttention.__init__   s     	" ,D)	 ,D)	"!.	)T^^;MdnnM] ^;b"  }}d*$ii		9@ii		9@ii	4@		)TBr"   tensorr3   r2   c                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   r   )rb   r~   r   	transpose
contiguous)r   r   r3   r2   s       r    _shapeTrOCRAttention._shape   s5    {{3GQQRSUVWbbddr"   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 	   USLnUR                  5       u  pn
U R                  U5      U R                  -  nU(       a  Ub  US   nUS   nGOU(       aE  U R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      nOUby  U R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      n[        R                  " US   U/SS9n[        R                  " US   U/SS9nODU R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      nU R                  (       a  X4nXR                  -  SU R                  4nU R                  XU5      R                  " U6 nUR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " XR                  SS5      5      nUR                  5       XR                  -  X4:w  a-  [        SXR                  -  X4 SUR                  5        35      eUbv  UR                  5       USX4:w  a"  [        S	USX4 SUR                  5        35      eUR                  XR                  X5      U-   nUR                  XR                  -  X5      n[        R                   R#                  USS9nUb  UR                  5       U R                  4:w  a*  [        S
U R                  4 SUR                  5        35      eUR                  SSSS5      UR                  XR                  X5      -  nUR                  XR                  -  X5      nU(       a;  UR                  XR                  X5      nUR                  XR                  -  X5      nOSn[        R                   R%                  UU R$                  U R&                  S9n[        R                  " UU5      nUR                  5       XR                  -  XR                  4:w  a5  [        SXR                  XR                  4 SUR                  5        35      eUR                  XR                  XR                  5      nUR                  SS5      nUR)                  XU
5      nU R+                  U5      nUUU4$ )z#Input shape: Batch x Time x ChannelNr   r   r)   r   rY   z$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )rj   r   r   r   r   r   r+   r_   r   r~   r   rb   bmmr   r   r   
functionalsoftmaxr   r   reshaper   )r   r   r   r   r   r   r   r   r2   tgt_lenr}   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r    r1   TrOCRAttention.forward   s    .T9"/"4"4"6i {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? )7NNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 %""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL',,S>>-A7TL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfm?wwL',,S>>-A7TL
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1!))#	BmmK01>AAr"   )r   r}   r   r   r   r   r~   r   r   r   r   r   )NN        FTF)NNNNF)r6   r7   r8   r9   r:   r;   r   rL   boolr   r+   r<   r   r   r1   r=   r>   r?   s   @r    r{   r{      sn   H #" #(C C 	C
 smC smC C C C !C CBeU\\ eC ec e 488<1526"'kB||kB #5<<0kB !u||!45	kB
 !.kB "%,,/kB  kB 
u||Xell3XeELL>Q5RR	SkB kBr"   r{   c                   4  ^  \ rS rSrS\4U 4S jjr        SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	\R                        S\\
   S\\
   4S jjrSrU =r$ )TrOCRDecoderLayeri#  r   c                 f  > [         TU ]  5         UR                  U l        [	        UU R                  UR
                  UR                  SS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        UR                   (       al  [	        UU R                  UR
                  UR"                  UR"                  UR                  SSS9U l        [        R                  " U R                  5      U l        [        R(                  " U R                  UR*                  5      U l        [        R(                  " UR*                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)r}   r~   r   r   )r}   r~   r   r   r   r   r   )r   r   hidden_sizer}   r{   decoder_attention_headsattention_dropout	self_attnr   r	   activation_functionactivation_fnactivation_dropoutr   	LayerNormself_attn_layer_normr   cross_attention_hidden_sizeencoder_attnencoder_attn_layer_normr   decoder_ffn_dimfc1fc2final_layer_normr   r   r   s     r    r   TrOCRDecoderLayer.__init__$  s7   ++'nn44,,
 ~~#F$>$>?"(";";$&LL$@! ... 88777700#'	!D ,.<<+GD(99T^^V-C-CD99V33T^^D "T^^ <r"   r   r   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   r   	use_cachec
           	      T   Un
Ub  USS OSnU R                  UUUUUS9u  pn[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nSnSnUbn  Un
Ub  USS OSnU R                  UUUUUUS9u  pn[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nX-   nUn
U R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nU4nU(       a  UX4-  nU	(       a  UU4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
        size *(decoder_attention_heads,)*.
    past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
Nr   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   residualself_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valueoutputss                     r    r1   TrOCRDecoderLayer.forwardF  s   < ! :H9S>"1#5Y] >Bnn'3)+/ ?M ?
;*; --m||VZVcVc-d 011-@ (,$! ,$H @N?Yrs(;_c%NRN_N_+!65 :8"3 O` OKM/K MM11-<<Z^ZgZg1hM$4M 88GM !2 P !**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m< ")>>G)++Gr"   )r   r   r   r}   r   r   r   r   r   r   r   )NNNNNNFT)r6   r7   r8   r9   r   r   r+   r<   r   r   r   r1   r=   r>   r?   s   @r    r   r   #  s     ={  =J 268<9=26=A8<,1$([||[ !.[  (5	[
 !) 6[ "%,,/[ %-U\\$:[ !u||!45[ $D>[ D>[ [r"   r   c                   ,    \ rS rSr\rSrSrS/rS r	Sr
g)TrOCRPreTrainedModeli  modelTr   c                 B   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g g )Nr   )meanstd)r   init_std
isinstancer   r   Conv1dr.   datanormal_r   zero_	EmbeddingrC   )r   moduler   s      r    _init_weights"TrOCRPreTrainedModel._init_weights  s    kk""fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r"    N)r6   r7   r8   r9   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r=   r   r"   r    r   r     s!    L&*#,-	?r"   r   c                   b   ^  \ rS rSrSrS\4U 4S jjrS rS r            S	S jr	Sr
U =r$ )
TrOCRDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TrOCRDecoderLayer`]

Args:
    config: TrOCRConfig
r   c                   > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  UR                  U R                  US9U l        UR                  (       a&  [        UR                   UR                  5      U l        O@[%        UR                   U R                  -   S-   UR                  U R                  5      U l        UR&                  (       a&  [(        R*                  " UR                  5      U l        OS U l        [(        R,                  " [/        UR0                  5       Vs/ s H  n[3        U5      PM     sn5      U l        SU l        U R9                  5         g s  snf )NrK   rJ   r   F)r   r   r   decoder_layerdrop	layerdroppad_token_idrC   scale_embeddingr[   sqrtr   rA   
vocab_sizeembed_tokensuse_learned_position_embeddingsr   max_position_embeddingsembed_positionsrN   layernorm_embeddingr   r   
ModuleListrangedecoder_layersr   layersgradient_checkpointing	post_init)r   r   rD   _r   s       r    r   TrOCRDecoder.__init__  sQ    ~~11!..7=7M7Mdii 2 23SV4v1143C3CQ\
 11#B6CaCacicucu#vD #E..1A1AAAE""  $D  %%')||F4F4F'GD$'+D$mmfNcNcHd$eHd1%6v%>Hd$ef&+#	 %fs   F;c                     U R                   $ rF   r   r   s    r    get_input_embeddings!TrOCRDecoder.get_input_embeddings  s       r"   c                     Xl         g rF   r  r   values     r    set_input_embeddings!TrOCRDecoder.set_input_embeddings  s    !r"   c                 8   U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  Ub  [        S5      eUb"  UnUR                  SUR                  S   5      nO.Ub   UR                  5       SS nUSS2SS2S4   nO[        S5      eUb  US   S   R                  S   OSnUc  U R                  U5      nU R                   R                  (       a  U R                  XS9nOU R                  XS9nUU-   nU R                  b  U R                  U5      n[        R                  R                  UU R                  U R                   S9nUR                  n[#        X.X5      nUb  Ub  [%        XHR&                  US   S	9nU R(                  (       a/  U R                   (       a  U	(       a  [*        R-                  S
5        Sn	U(       a  SOSnU
(       a  SOSnU
(       a  Ub  SOSnU	(       a  SOSn[/        XV/SS/5       Hn  u  nnUc  M  UR                  5       S   [1        U R2                  5      :w  d  M7  [        SU S[1        U R2                  5       SUR                  5       S    S35      e   [5        U R2                  5       GH  u  nnU(       a  UU4-  nU R                   (       a(  [6        R8                  " / 5      nUU R:                  :  a  MM  Ub  UU   OSnU R(                  (       aF  U R                   (       a5  U R=                  UR>                  UUUUUb  UU   OSUb  UU   OSSU
U	5
      nOU" UUUUUb  UU   OSUb  UU   OSUU
U	S9	nUS   nU	(       a  UUU
(       a  SOS   4-  nU
(       d  M  UUS   4-  nUc  GM  UUS   4-  nGM     U(       a  UU4-  nU	(       a  UOSnU(       d  [A        S UUUUU4 5       5      $ [C        UUUUUS9$ )aa  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
        on hidden heads. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
        shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
        shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer)   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r   )r$   r   )r   z^`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...Fr   	head_maskcross_attn_head_maskzThe `z` should be specified for z layers, but it is for .)r   r   r   r   r   r   r   r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7frF   r   ).0vs     r    	<genexpr>'TrOCRDecoder.forward.<locals>.<genexpr>  s      mA ms   	)last_hidden_statepast_key_valuesr   
attentionscross_attentions)"r   r   output_hidden_statesr   use_return_dictr   rb   r*   rj   r   r   r   r   r   r   r   r   r   r   r'   r   loggerwarning_onceziplenr   	enumerater+   randr   _gradient_checkpointing_func__call__tupler   )r   r#   r   r   r   r  r  r  inputs_embedsr   r   r  return_dictinputinput_shaper$   	embed_posr   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cache	attn_mask	mask_nameidxdecoder_layerdropout_probabilityr   layer_outputs
next_caches                                 r    r1   TrOCRDecoder.forward  s|   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>stt"E!r5;;r?;I&',,.s3K!!Q(+Edee DSC^!3A!6!<!<Q!?de  --i8M;;66,,U,bI,,Y,fI%	1##/ 44]CM--mt||VZVcVc-dkk:

 !,1G1S%?&(;(;[QS_&" &&4==##t "	 #7BD0d&7<Q<]rdh#,R$ %((IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#!m%55!}}&+jjn#&75D5P_S1VZN**t}} $ A A!**!")*&/&;IcN1E1Q(-W[%! !.!#1*?+A7@7LYs^RV5I5U,S1[_#1&7'! *!,M"}:KQQR'S&UU"  =#3"55(4(]1-=,??(_ #9d  -!11+4'$
 '5FXlm  
 9+&+%1
 	
r"   )r   r   r   r   r   r   r   rC   )NNNNNNNNNNNN)r6   r7   r8   r9   r:   r   r   r  r
  r1   r=   r>   r?   s   @r    r   r     sP    { >!"
 "#!!Y
 Y
r"   r   a  
    The TrOCR Model with a language modeling head. Can be used for summarization.
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    )custom_introc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TrOCRDecoderWrapperi  c                 D   > [         TU ]  U5        [        U5      U l        g rF   )r   r   r   decoderr   s     r    r   TrOCRDecoderWrapper.__init__  s     #F+r"   c                 &    U R                   " U0 UD6$ rF   r9  )r   argskwargss      r    r1   TrOCRDecoderWrapper.forward  s    ||T,V,,r"   r<  )r6   r7   r8   r9   r   r1   r=   r>   r?   s   @r    r7  r7    s    ,- -r"   r7  zy
    The TrOCR Decoder with a language modeling head. Can be used as the decoder part of [`EncoderDecoderModel`] and
    c                      ^  \ rS rSrS/rU 4S jrS rS rS rS r	S r
S	 r\             SS
\\R                     S\\R                      S\\R"                     S\\R                     S\\R                      S\\R                      S\\\\R"                           S\\R"                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       r\S 5       rSrU =r$ )TrOCRForCausalLMi  zoutput_projection.weightc                 
  > [         R                  " U5      nSUl        SUl        [        TU ]  U5        [        U5      U l        [        R                  " UR                  UR                  SS9U l        U R                  5         g )NTFr   )copydeepcopyr   is_encoder_decoderr   r   r7  r   r   r   r   r   output_projectionr   r   s     r    r   TrOCRForCausalLM.__init__  sh    v& $)! (0
!#6+=+=v?P?PW\!] 	r"   c                 B    U R                   R                  R                  $ rF   r   r9  r   r  s    r    r  %TrOCRForCausalLM.get_input_embeddings  s    zz!!...r"   c                 8    XR                   R                  l        g rF   rI  r  s     r    r
  %TrOCRForCausalLM.set_input_embeddings  s    */

'r"   c                     U R                   $ rF   rF  r  s    r    get_output_embeddings&TrOCRForCausalLM.get_output_embeddings  s    %%%r"   c                     Xl         g rF   rN  )r   new_embeddingss     r    set_output_embeddings&TrOCRForCausalLM.set_output_embeddings  s    !/r"   c                 $    XR                   l        g rF   r   r9  )r   r9  s     r    set_decoderTrOCRForCausalLM.set_decoder  s    $

r"   c                 .    U R                   R                  $ rF   rV  r  s    r    get_decoderTrOCRForCausalLM.get_decoder  s    zz!!!r"   r#   r   r   r   r  r  r  r$  labelsr   r   r  r%  r   c                 L   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  R                  UUUUUUUUU
UUUS9nU R                  US   5      nSnU	bF  [        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a	  
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import (
...     TrOCRConfig,
...     TrOCRProcessor,
...     TrOCRForCausalLM,
...     ViTConfig,
...     ViTModel,
...     VisionEncoderDecoderModel,
... )
>>> import requests
>>> from PIL import Image

>>> # TrOCR is a decoder model and should be used within a VisionEncoderDecoderModel
>>> # init vision2text model with random weights
>>> encoder = ViTModel(ViTConfig())
>>> decoder = TrOCRForCausalLM(TrOCRConfig())
>>> model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

>>> # If you want to start from the pretrained model, load the checkpoint with `VisionEncoderDecoderModel`
>>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

>>> # load image from the IAM dataset
>>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
>>> pixel_values = processor(image, return_tensors="pt").pixel_values
>>> text = "industry, ' Mr. Brown commented icily. ' Let us have a"

>>> # training
>>> model.config.decoder_start_token_id = processor.tokenizer.eos_token_id
>>> model.config.pad_token_id = processor.tokenizer.pad_token_id
>>> model.config.vocab_size = model.config.decoder.vocab_size

>>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
>>> outputs = model(pixel_values, labels=labels)
>>> loss = outputs.loss
>>> round(loss.item(), 2)
5.30

>>> # inference
>>> generated_ids = model.generate(pixel_values)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> generated_text
'industry, " Mr. Brown commented icily. " Let us have a'
```N)r#   r   r   r   r  r  r  r$  r   r   r  r%  r   r)   r   )losslogitsr  r   r  r  )r   r   r  r  r   r9  rF  r   rb   r   r   r  r   r  r  )r   r#   r   r   r   r  r  r  r$  r\  r   r   r  r%  r   r_  r^  loss_fctoutputs                      r    r1   TrOCRForCausalLM.forward  sH   X 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **$$)"7#9!5+'/!5# % 
 ''
3')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r"   c                 P   ^ SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nr   c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7fr5   )rl   rd   r(   )r  
past_statebeam_idxs     r    r  2TrOCRForCausalLM._reorder_cache.<locals>.<genexpr>q  s1     ncmU_--aZ=N=N1OPPcms   7:)r#  )r  rf  reordered_past
layer_pasts    `  r    _reorder_cacheTrOCRForCausalLM._reorder_cachel  s8    )Jncmnn N * r"   )r   rF  )NNNNNNNNNNNNN)r6   r7   r8   r9   _tied_weights_keysr   r  r
  rO  rS  rW  rZ  r   r   r+   
LongTensorr<   rV   r   r   r   r   r1   rx   rj  r=   r>   r?   s   @r    rA  rA    s    55
/0&0%"  1515=A=A,07;EI59-1$(,0/3&*s
E,,-s
 !.s
  ((9(9:	s

 !))9)9 :s
 ELL)s
 'u||4s
 "%e.?.?(@"ABs
   1 12s
 ))*s
 D>s
 $D>s
 'tns
 d^s
 
u77	8s
 s
j  r"   rA  )+r:   rC  r[   typingr   r   r   r+   r   torch.nnr   activationsr	   
generationr
   modeling_attn_mask_utilsr   r   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_trocrr   
get_loggerr6   r  r   r   rA   ModulerN   r{   r   r   r   r7  rA  __all__r   r"   r    <module>rz     s.   6   ) )   % ! ) m - , , 
		H	%;bll ;4
=r|| 
=;8 ;8|RBRYY RBj~		 ~B ?? ? ?$F
' F
R -. -- 
^+_ ^
^B 5
6r"   