
    fTh                        S r SSKrSSKJrJrJrJrJrJr  SSK	r	SSK	J
r
Jr  SSKJrJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJrJr  SSKJr  SSKJrJr  SSKJ r   \RB                  " \"5      r# S r$S.S jr%SS\	RL                  4S jr'\ " S S\5      5       r(S r)S r*S r+S/S jr, " S S\RZ                  5      r. " S S\RZ                  5      r/ " S S\RZ                  5      r0 " S S\RZ                  5      r1S  r2 " S! S"\RZ                  5      r3S# r4S$ r5\ " S% S&\(5      5       r6\" S'S(9 " S) S*\(\5      5       r7 " S+ S,\Rp                  5      r9/ S-Qr:g)0z`PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19    N)AnyDictListOptionalTupleUnion)Tensornn)CrossEntropyLoss	LayerNorm   )ACT2FN)GenerationMixin)is_deepspeed_zero3_enabled)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)auto_docstringlogging   )
FSMTConfigc                 P    U R                  5       S:X  d   eU R                  S5      $ )z+Turns 1->0, 0->1, False->True, True-> False   r   )dimeq)attention_masks    ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/fsmt/modeling_fsmt.pyinvert_maskr       s*    1$$$Q    c                     U R                   S   n[        R                  " X R                  S9nUR	                  X"5      nUR                  S5      nU(       a  X1-   nXC:  nU R                  US:H  S5      $ )Nr   device)shapetorcharanger$   expand	unsqueezemasked_fill)xdiagonallr(   masks        r   	triu_onnxr0      sf    	
A\\!HH-F==Db!F">D==A&&r!   c           
         U R                   nUc  [        X5      nUR                  5       u  pgUc  [        X%5      nO[	        U5      n[        [        [        R                  " XwUS95      S5      R                  UR                  S9nX#U4$ )z
Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided.
This mimics the default behavior in fairseq. To override it pass in masks. Note: this is not called during
generation
dtyper   r#   )pad_token_idshift_tokens_rightsizemake_padding_maskr    r0   fill_with_neg_infr'   zerostor$   )	config	input_idsdecoder_input_idsdecoder_padding_maskcausal_mask_dtyper4   bsztgt_lencausal_masks	            r   _prepare_fsmt_decoder_inputsrC      s     &&L .yG$))+LC#01BQ*+?@-ekk'Rc.deghill '' m K K??r!   c                   2    \ rS rSr\rSrS r\S 5       r	Sr
g)PretrainedFSMTModel   modelc                    U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        5      (       a^  UR                  " / UR
                  R                  QUR                  P76 n[        R                  " USS9nUR                  5         X1l        g [        U[        R                   5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )N        )meanstdF)requires_grad)r;   init_std
isinstancer
   Linearweightdatanormal_biaszero_SinusoidalPositionalEmbeddingget_embeddingr&   padding_idx	Parameterdetach_	Embedding)selfmodulerK   rP   s       r   _init_weights!PretrainedFSMTModel._init_weights   s   kk""fbii((MM&&CS&9{{&  &&( ' =>>))S6==+>+>S@R@RSF\\&>FNN"M--MM&&CS&9!!-""6#5#56<<> . .r!   c                     U R                   R                  n[        R                  " / SQSSSSU//U R                  S9nUR                  U5      US.nU$ )N)r      
      r   r         r   r#   )r   r<   )r;   r4   r'   tensorr$   ne)r[   	pad_tokenr<   dummy_inputss       r   rh    PretrainedFSMTModel.dummy_inputs   sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r!    N)__name__
__module____qualname____firstlineno__r   config_classbase_model_prefixr]   propertyrh   __static_attributes__rj   r!   r   rE   rE      s&    L?   r!   rE   c                     U R                   R                  u  p[        R                  " XSS9nU R                   R                  UR                   l        U$ )NFrS   )rP   r&   r
   rO   rQ   )emb
vocab_sizeemb_size	lin_layers       r   _make_linear_from_embry      s@    ::++J		*U;IJJOOIr!   c                 0    X:w  a  [        SU  SU 35      eg )Nzshape mismatch: z != )AssertionError)shape_1shape2s     r   _check_shapesr~     s&    /yVHEFF r!   c                 &   U R                  U S:H  U5        U R                  5       nU R                  U5      R                  SS9S-
  R	                  S5      nU R                  SU5      R                  5       USS2S4'   U SS2SS24   USS2SS24'   U$ )zXShift input ids one token to the right, and wrap the last non pad token (usually <eos>).ir   r   r%   Nr   )masked_fill_clonerf   sumr*   gathersqueeze)r<   r4   prev_output_tokensindex_of_eoss       r   r5   r5     s     9,l;"*LL.22q29A=HHLL(//<@HHJq!t )!SbS& 1q!"ur!   c                 V    U R                  U5      nUR                  5       (       d  SnU$ )zTrue for pad tokensN)r   any)r<   rW   padding_masks      r   r7   r7     s*    <<,Lr!   c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )EncoderLayeri  r;   c                 <  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  S9U l        [        U R                  5      U l	        UR                  U l
        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        U R                  5      U l        g )N)dropout)super__init__d_model	embed_dim	Attentionencoder_attention_headsattention_dropout	self_attnr   self_attn_layer_normr   r   activation_functionactivation_fnactivation_dropoutr
   rO   encoder_ffn_dimfc1fc2final_layer_normr[   r;   	__class__s     r   r   EncoderLayer.__init__  s    "4>>63Q3Q[a[s[st$-dnn$=!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D )$.. 9r!   c                    UnU R                  UUUUUS9u  p[        R                  R                  XR                  U R                  S9nXQ-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nXQ-   nU R                  U5      nX4$ )a  
Args:
    x (`torch.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
    encoder_padding_mask (`torch.ByteTensor`): binary ByteTensor of shape
        *(batch, src_len)* where padding elements are indicated by `1`.
    for t_tgt, t_src is excluded (or masked out), =0 means it is
    included in attention
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        *(config.encoder_attention_heads,)*.

Returns:
    encoded output of shape *(seq_len, batch, embed_dim)*
)querykeykey_padding_masklayer_head_maskoutput_attentionsptraining)r   r
   
functionalr   r   r   r   r   r   r   r   )r[   r,   encoder_padding_maskr   r   residualattn_weightss          r   forwardEncoderLayer.forward+  s     ..1+/ ) 
 MM!!!||dmm!LL%%a(txx{+MM!!!'>'>!WHHQKMM!!!||dmm!LL!!!$r!   )	r   r   r   r   r   r   r   r   r   )F	rk   rl   rm   rn   r   r   r   rr   __classcell__r   s   @r   r   r     s    
:z 
:! !r!   r   c                      ^  \ rS rSrSrS\4U 4S jjr      SS\R                  S\	\R                     S\	\R                     S\	\R                     S	\
S
\
S\
4S jjrSrU =r$ )FSMTEncoderiO  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`EncoderLayer`].

Args:
    config: FSMTConfig
r;   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        X l        UR                  nUR                  (       a  [        R                  " U5      OSU l        [        UR                  U R
                  -   S-   X0R
                  5      U l        [        R                   " [#        UR$                  5       Vs/ s H  n['        U5      PM     sn5      U l        g s  snf )N      ?r   )r   r   r   encoder_layerdrop	layerdroprW   embed_tokensembedding_dimscale_embeddingmathsqrtembed_scalerU   max_position_embeddingsembed_positionsr
   
ModuleListrangeencoder_layersr   layers)r[   r;   r   r   _r   s        r   r   FSMTEncoder.__init__W  s    ~~11'33( ..	393I3I499Y/s<**T-=-==A9N^N^ 
 mm5I^I^C_$`C_a\&%9C_$`a$`s   Dr<   r   inputs_embeds	head_maskr   output_hidden_statesreturn_dictc                    Ub  [        U5      nUb  Ub  [        S5      eUb0  U R                  U5      U R                  -  nU R	                  U5      nOwUbi  X0R                  -  nUSS2SS2S4   R                  USS2SS2S4   R                  S5      U R                  R                  5      n	U R	                  U	5      nO[        S5      eX8-   n
[        R                  R                  XR                  U R                  S9n
U
R                  SS5      n
U(       a  SOSnU(       a  SOSnUb\  UR                  5       S   [        U R                  5      :X  d2   S[        U R                  5       S	UR                  5       S    S
35       e[!        U R                  5       H  u  pU(       a)  U
R                  SS5      n
X4-  nU
R                  SS5      n
["        R$                  " / 5      nU R                  (       a  XR&                  :  a  SnOU" U
UUb  XM   OSUS9u  n
nU(       d  M  UU4-   nM     U
R                  SS5      n
U(       a  X4-  nU(       d  [)        S XU4 5       5      $ [+        XUS9$ )aL  
Args:
    input_ids (`torch.LongTensor`): tokens in the source language of shape
        *(batch, src_len)*
    attention_mask (`torch.LongTensor`): indicating which indices are padding tokens
    inputs_embeds (`torch.FloatTensor`):
        embedding vectors of shape *(batch, src_len, embed_dim)*
    head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

Returns:
    BaseModelOutput or Tuple comprised of:

        - **x** (`torch.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
        - **encoder_states** (`Tuple(torch.FloatTensor)`): all intermediate hidden states of shape *(src_len,
          batch, embed_dim)*. Only populated if *output_hidden_states:* is True.
        - **all_attentions** (`Tuple(torch.FloatTensor)`): Attention weights for each layer.
        During training might not be of length n_layers because of layer dropout.
NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   r   rj   z&The head_mask should be specified for  layers, but it is for .)r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fNrj   .0vs     r   	<genexpr>&FSMTEncoder.forward.<locals>.<genexpr>  s     Y$Gq$G   	last_hidden_statehidden_states
attentions)r    
ValueErrorr   r   r   r+   r   rW   r
   r   r   r   	transposer6   lenr   	enumerater'   randr   tupler   )r[   r<   r   r   r   r   r   r   	embed_posposition_idsr,   encoder_statesall_attentionsidxencoder_layerdropout_probabilityattns                    r   r   FSMTEncoder.forwardd  sf   B %(8N ]%>cdd" --i84;K;KKM,,Y7I&),<,<<M )Aq1==aAg&))!,d.B.B.N.NL ,,\:ITUU%MM!!!||dmm!L KK130d >>#A&3t{{+;< 8T[[9I8JJabkbpbpbrstbuavvwx< #,DKK"8C#KK1%$&KK1%"'**R.}}"5"F'"7@7LY^RV&7	4 ! !/4'!9% #9* KK1d"NYQ$GYYY]kllr!   )r   r   r   r   r   r   rW   )NNNFFT)rk   rl   rm   rn   __doc__r   r   r'   r	   r   boolr   rr   r   r   s   @r   r   r   O  s    bz b  2604,0"'%* _m<<_m !._m  -	_m
 ELL)_m  _m #_m _m _mr!   r   c                   H   ^  \ rS rSrS\4U 4S jjr       SS jrSrU =r$ )DecoderLayeri  r;   c                   > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  S9U l        UR                  U l        [        UR                     U l        UR                  U l        [        U R                  5      U l        [	        U R                  UR
                  UR                  SS9U l        [        U R                  5      U l        ["        R$                  " U R                  UR&                  5      U l        ["        R$                  " UR&                  U R                  5      U l        [        U R                  5      U l        g )N)r   	num_headsr   T)r   encoder_decoder_attention)r   r   r   r   r   decoder_attention_headsr   r   r   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr
   rO   decoder_ffn_dimr   r   r   r   s     r   r   DecoderLayer.__init__  s   "nn44,,

 ~~#F$>$>?"(";";$-dnn$=!%NN**,,&*	
 (1'@$99T^^V-C-CD99V33T^^D )$.. 9r!   c
           
      N   Un
Uc  0 nU R                  UUUUUUU	S9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUn
U R                  R                  U R                   R                  :w  d   eU R                  UUUUUU	S9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUn
U R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUUUU4$ )N)r   r   layer_stater   	attn_maskr   r   r   )r   r   r   r   r   r   )r   r
   r   r   r   r   r   	cache_keyr   r   r   r   r   r   )r[   r,   encoder_hidden_statesencoder_attn_maskr   rB   r   cross_attn_layer_head_maskr>   r   r   self_attn_weightscross_attn_weightss                r   r   DecoderLayer.forward  s    K  $~~#1!+/  .  
 MM!!!||dmm!LL%%a(   **dnn.F.FFFF $ 1 1%.#6/ !2 !
 MM!!!||dmm!LL((+ txx{+MM!!!'>'>!WHHQKMM!!!||dmm!LL!!!$	
 	
r!   )r   r   r   r   r   r   r   r   r   r   r   )NNNNNNFr   r   s   @r   r   r     s0    :z :: #'!;
 ;
r!   r   c                   v  ^  \ rS rSrSrS\S\R                  4U 4S jjrS r	        SS\
R                  S\
R                  S	\
R                  S
\
R                  S\
R                  S\\
R                     S\\
R                     S\\
R                     S\\\
R                        S\S\S\S\4S jjrSrU =r$ )FSMTDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DecoderLayer`]

Args:
    config: FSMTConfig
    embed_tokens (nn.Embedding): output embedding
r;   r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  (       a   [        R                  " UR                  5      OSU l
        X l        UR                  n[        UR                  U R
                  -   S-   X0R
                  5      U l        [         R"                  " [%        UR&                  5       Vs/ s H  n[)        U5      PM     sn5      U l        [-        5       (       a\  SS KnUR0                  R3                  U R                  R4                  S S9   U R                  R4                  R6                  nS S S 5        O U R                  R4                  R6                  n[         R8                  " WS   US   SS9U l        U R                  R4                  U R:                  l        g s  snf ! , (       d  f       NZ= f)Nr   r   r   )modifier_rankFrt   )r   r   r   decoder_layerdropr   rW   r   r   r   r   r   r   r   rU   r   r   r
   r   r   decoder_layersr   r   r   	deepspeedzeroGatheredParametersrP   r&   rO   output_projection)r[   r;   r   r   r   r  embed_tokens_weight_shaper   s          r   r   FSMTDecoder.__init__'  s~   ~~11'338>8N8N499V^^4TW( ..	<**T-=-==A9N^N^ 
 mm5I^I^C_$`C_a\&%9C_$`a%''2243D3D3K3K[_2`,0,=,=,D,D,J,J) a` )-(9(9(@(@(F(F%!#+DQ+GIbcdIelq!r(,(9(9(@(@% %a
 a`s   )G!G!!
G/c                 N    U R                   R                  U R                  l        g r   )r	  rP   r   r[   s    r   _tie_weightsFSMTDecoder._tie_weights>  s    #'#9#9#@#@ r!   r<   r   r   r>   decoder_causal_maskr   r   cross_attn_head_maskpast_key_values	use_cacher   r   r   c                    Ub  [        U5      nUb  Ub  [        S5      eUbM  U R                  U5      nU
(       a  USS2SS24   nUSS2SS24   nU R                  U5      U R                  -  nOwUbi  USS2SS2S4   R                  USS2SS2S4   R                  S5      U R                  R                  5      nU R                  U5      nXpR                  -  nO[        S5      eX-  n[        R                  R                  XR                  U R                  S9nUR                  SS5      nUR                  SS5      nU(       a  SOSnU(       a  SOSnU(       a  SOSn/ n[        Xh/S	S
/5       Hj  u  nnUc  M  UR                  5       S   [        U R                   5      :X  a  M7   SU S[        U R                   5       SUR                  5       S    S35       e   [#        U R                   5       H  u  nnU(       a*  UR                  SS5      nUU4-  nUR                  SS5      nU R                  (       a(  [$        R&                  " / 5      nUU R(                  :  a  Mp  U	b  U	U   OSnU" UUUUUUUb  UU   OSUb  UU   OSUS9	u  nnnnU
(       a  UR+                  UR-                  5       5        U(       d  M  UU4-  nUU4-  nM     U(       a*  UR                  SS5      nUU4-  nUR                  SS5      nUR                  SS5      nUR                  SS5      nU R/                  U5      nU
(       a  UOSnU(       d  [1        S UUUUU4 5       5      $ [3        UUUUUS9$ )a  
Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
EMNLP 2019).

Args:
    input_ids (`torch.LongTensor` of shape `(batch, tgt_len)`):
        previous decoder outputs for teacher forcing
    encoder_hidden_states: output from the encoder, used for
        encoder-side attention
    encoder_padding_mask: for ignoring pad tokens
    past_key_values (dict or None): dictionary used for storing state during generation
    head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
        Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

Returns:
    BaseModelOutputWithPast or tuple:

        - the decoder's features of shape *(batch, tgt_len, embed_dim)*
        - the cache
        - hidden states
        - attentions
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer%   r   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r   rj   r   r  zThe `z` should be specified for r   r   )r   r>   r   rB   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rj   r   s     r   r   &FSMTDecoder.forward.<locals>.<genexpr>  s      ^a^r   )r   r  r   r   cross_attentions)r    r   r   r   r   r+   r   rW   r
   r   r   r   r   zipr6   r   r   r   r'   r   r   appendcopyr	  r   r   )r[   r<   r   r   r>   r  r   r   r  r  r  r   r   r   	positionsr,   r   all_hidden_statesall_self_attnsall_cross_attnsnext_decoder_cacher   	mask_namer   decoder_layerr   r   layer_self_attn
layer_pastlayer_cross_attn
next_caches                                  r   r   FSMTDecoder.forwardA  s   `  +#./C#D  ]%>stt",,Y7I%af-	%af-	!!),t/?/??A& )Aq1==aAg&))!,d.B.B.N.NL ,,\:I 0 00Adee	MM!!!||dmm!L KK1 5 ? ?1 E #7BD0d 1"t %((IKYoKp$q Iy$ ~~'*s4;;/?@ I;&@T[[AQ@R S!(+,A/@ %r #,DKK"8C#KK1%!aT)!KK1%}}&+jjn#&72A2M/#.SWK?L%"6%9'/3<3H3dI]Ii,@,Eos"3
@<A
,< "))*//*;<  ?"44$4#66= #9B  Aq!A!%Aq!A KK1 5 ? ?1 E""1%+4'$
 z+<no^   9&+%,
 	
r!   )r   r   r   r   r   r   r	  rW   )NNNNFFFT)rk   rl   rm   rn   r   r   r
   rZ   r   r  r'   r	   r   r   FloatTensorr   r   rr   r   r   s   @r   r  r    s   Az A A.A -1047;=A"'%* S
<<S
  %||S
 $ll	S

 $llS
 #\\S
 ELL)S
  -S
 'u||4S
 "$u'8'8"9:S
 S
  S
 #S
 S
 S
r!   r  c                 j    U R                  5        H  u  p#Uc  M
  UR                  SU5      X'   M      U $ )Nr   )itemsindex_select)
attn_cache	new_orderkinput_buffer_ks       r   _reorder_bufferr/    s:    '--/%*779EJM 0 r!   c                      ^  \ rS rSrSr   SU 4S jjrS r     SS\\   S\\   S\\	\
\\   4      S\\   S	\\   S
\\\\   4   4S jjrS rSrU =r$ )r   i  z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :X  d   S5       eU R
                  S-  U l        XPl        [        R                  " XUS9U l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        U R                  (       a  SU l        g SU l        g )Nz(embed_dim must be divisible by num_headsg      rt   encoder_decoderr[   )r   r   r   r   r   head_dimscalingr   r
   rO   k_projv_projq_projout_projr   )r[   r   r   r   rS   r   r   s         r   r   Attention.__init__  s     	""!.}}y(DNN:f<ff:}}d*)B&ii	4@ii	4@ii	4@		)TB.2.L.L*RXr!   c                     UR                  5       R                  X#U R                  -  U R                  5      R	                  SS5      $ )Nr   r   )
contiguousviewr   r3  r   )r[   re   seq_lenr@   s       r   _shapeAttention._shape  s:      "''t~~1Et}}U__`acdeer!   r   r   r   r   r   returnc                 	   U R                   nUR                  5       u  pnXR                  :X  d   e[        UR                  5       5      XU/:X  d   eUb,  UR	                  U R
                  0 5      nSU;   a	  U(       a  SnOSn0 nU R                  U5      U R                  -  nU(       a*  Uc  S=pOEU R                  U5      nU R                  U5      nO"U R                  U5      nU R                  U5      nU R                  XU
5      nUb  U R                  USU
5      nUb  U R                  USU
5      nUb  U R                  XXX5      u  pnUR                  XR                  SU R                  5      UR                  XR                  SU R                  5      U(       d  UOSS.X@R
                  '   Uc   eUR                  S5      n[        R                   " XR#                  SS5      5      nUR                  5       XR                  -  U	U4:X  d   eUb?  UR                  XR                  U	U5      U-   nUR                  XR                  -  U	U5      nUb  UR%                  5       S:X  a  SnUb  UR                  5       SS U
U4:X  d   eUb  UR                  XR                  U	U5      nUR'                  S5      R'                  S5      nUR)                  U[        R*                  " UR,                  5      R.                  5      nUR                  XR                  -  U	U5      n[0        R2                  R5                  USS9nUb  UR                  5       U R                  4:X  d&   S	U R                  4 S
UR                  5        35       eUR                  SSSS5      UR                  XR                  U	U5      -  nUR                  XR                  -  U	U5      nU(       a=  UR                  XR                  U	U5      nUR                  XR                  -  U	U5      nOSn[0        R2                  R7                  UU R6                  U R8                  S9nUc   e[        R                   " UU5      nUR                  5       XR                  -  XR                  4:X  d   eUR#                  SS5      R;                  5       R                  XU5      nU R=                  U5      nUU4$ )z+Input shape: Time(SeqLen) x Batch x ChannelNprev_keyr%   )rB  
prev_valueprev_key_padding_maskr   r   r   r   z/Head mask for a single layer should be of size z	, but is r   )r   r6   r   listgetr   r7  r4  r5  r6  r>  _use_saved_stater<  r   r3  r'   bmmr   r   r*   r+   finfor3   minr
   r   softmaxr   r   r;  r8  )r[   r   r   r   r   r   r   r   	static_kvrA   r@   r   saved_stateqr-  r   src_lenr   reshapedattn_weights_reshaped
attn_probsattn_outputs                         r   r   Attention.forward  s    88	"'**,iNN***EJJL!gI%>>>>"%//$.."=K[(YKKKK-{AKK$KK$E"AE"AKKC(=Ar3'A=Ar3'A"%)%:%:1`i%o"A" sNNBF&&nnb$--H=F%5D'
NN# }}&&)yyKK1$56  "s^^';Wg&NNNN ',,S..'7SV__L',,S>>-A7GTL ',<,@,@,Ba,G#'+;+@+@+B2A+FK
 ,
 	
 

 '',,S..'7SL'11!4>>qAH'33Hekk,J\J\>]>a>abL',,S>>-A7GTL}},,\r,B&"'')dnn->> A4>>BSATT]^m^r^r^t]uv> +//2q!<|?P?PQTVdVdfmov?wwL',,S>>-A7GTL$0$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**ll]] + 

 }}ii
A.!cNN&:G]]%SSSS!++Aq1<<>CCGR[\mmK0111r!   c                    SU;   aX  US   nUc   eUR                  X`R                  -  SU R                  5      nU(       a  UnOUc   e[        R                  " X/SS9nSU;   aX  US   n	U	c   eU	R                  X`R                  -  SU R                  5      n
U(       a  U
nOUc   e[        R                  " X/SS9nUb  Uc   eUR                  SS 5      nUb!  U(       a  UnO[        R                  " X/SS9nOUnXU4$ )NrB  r%   r   r   rC  rD  )r<  r   r3  r'   catrF  )r[   r-  r   rM  r   rL  r@   	_prev_keyrB  _prev_valuerC  rD  new_key_padding_masks                r   rG  Attention._use_saved_stated  s   $#J/I((( ~~cNN&:BNH}$}IIxm3;&%l3K***$))#*>DMMRJ}$}IIzo15}..2=//BY[_2` ,'<$',yy2G1Z`a'b$#3 )))r!   )r   r   r   r   r3  r5  r   r8  r7  r4  r6  )rI   TF)NNNNF)rk   rl   rm   rn   r   r   r>  r   r	   r   strr   r   rG  rr   r   r   s   @r   r   r     s    G "'Y.f .2=A&*,0g2 fg2 #6*	g2
 d3(8#89:g2 F#g2 "&)g2 
vx''	(g2R* *r!   r   c                     U R                  5       R                  [        R                  " U R                  5      R
                  5      R                  U 5      $ )z:FP16-compatible function that fills a input_ids with -inf.)floatfill_r'   rI  r3   rJ  type_asts    r   r8   r8     s5    779??5;;qww/334<<Q??r!   c                     [        U SS 5      $ )Nr&   )getattrr`  s    r   
_get_shaperd    s    1gt$$r!   c            $       6  ^  \ rS rSrSS/rS\4U 4S jjrS rS rS r	\
              SS	\R                  S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\\R"                        S\\\R"                        S\\   S\\   S\\   S\\R"                     S\\R"                     S\\   S\\\R                     \4   4 S jj5       rS rS rS rS rSrU =r$ ) 	FSMTModeli  decoder.embed_tokens.weight decoder.output_projection.weightr;   c                 L  > [         TU ]  U5        UR                  n[        R                  " UR
                  UR                  U5      n[        R                  " UR                  UR                  U5      n[        X5      U l	        [        X5      U l        U R                  5         g r   )r   r   r4   r
   rZ   src_vocab_sizer   tgt_vocab_sizer   encoderr  decoder	post_init)r[   r;   rW   encoder_embed_tokensdecoder_embed_tokensr   s        r   r   FSMTModel.__init__  s{     ))!||F,A,A6>>S^_!||F,A,A6>>S^_"6@"6@ 	r!   c                     U R                   $ r   )rl  r  s    r   get_encoderFSMTModel.get_encoder      ||r!   c                     U R                   $ r   )rm  r  s    r   get_decoderFSMTModel.get_decoder  ru  r!   c                    U R                   R                  (       ai  U R                  U R                  R                  U R                  5       5        U R                  U R                  R                  U R                  5       5        g g r   )r;   tie_word_embeddings_tie_or_clone_weightsrm  r   get_input_embeddingsr	  r  s    r   r  FSMTModel._tie_weights  sZ    ;;**&&t||'@'@$B[B[B]^&&t||'E'EtG`G`Gbc +r!   r<   r   r=   decoder_attention_maskr   decoder_head_maskr  encoder_outputsr  r  r   r   r   decoder_inputs_embedsr   r@  c                    Uc  Sn
Ub  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU
(       dG  UbD  [        U R                   UUUU R                  R                  R                  R                  S9u  nnnOSu  nnUc  Uc  [        S5      eUc  U R                  UUUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S	:  a  US	   OSS
9nU R                  UUS   UUUUUUU	U
UUUS9nU(       d  UU-   $ [        UR                   UR"                  UR$                  UR&                  UR(                  UR                   UR$                  UR&                  S9$ )as  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    FSMT uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
NF)r=   r>   r?   NNzIMake sure that `decoder_input_ids` or `decoder_inputs_embeds` are passed.)r<   r   r   r   r   r   r   r   r   r   r   )	r  r   r   r  r  r  r   r   r   )r   r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentions)r;   r   r   r  use_return_dictrC   rm  r   rP   r3   r   rl  rN   r   r   r   r   r  r   r   r  )r[   r<   r   r=   r~  r   r  r  r  r  r  r   r   r   r  r   r>   rB   decoder_outputss                      r   r   FSMTModel.forward  s   P $I1B1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] Y2C_"3%;"&,,";";"B"B"H"HD@3[ 1;- +$)>)Fhii""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,,A  +/'!5+/!5# ' 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r!   c                 .    U R                   R                  $ r   rl  r   r  s    r   r|  FSMTModel.get_input_embeddings      ||(((r!   c                 $    XR                   l        g r   r  r[   values     r   set_input_embeddingsFSMTModel.set_input_embeddings      $)!r!   c                 .    U R                   R                  $ r   rm  r   r  s    r   get_output_embeddingsFSMTModel.get_output_embeddings"  r  r!   c                 $    XR                   l        g r   r  r  s     r   set_output_embeddingsFSMTModel.set_output_embeddings%  r  r!   )rm  rl  )NNNNNNNNNNNNNN)rk   rl   rm   rn   _tied_weights_keysr   r   rs  rw  r  r   r'   
LongTensorr   r	   
BoolTensorr   r'  r   r   r   r   r|  r  r  r  rr   r   r   s   @r   rf  rf    s   79[\z d
  268<=A,0487;>B>B$(,0/359=A&*!o
##o
 !.o
 $E$4$45	o

 !))9)9 :o
 ELL)o
 $ELL1o
 'u||4o
 "%(9(9":;o
 "%(9(9":;o
 D>o
 $D>o
 'tno
   1 12o
  ((9(9:o
  d^!o
" 
uU\\"$66	7#o
 o
b)*)* *r!   rf  zV
    The FSMT Model with a language modeling head. Can be used for summarization.
    )custom_introc            &         ^  \ rS rSrSrSS/rS\4U 4S jjr\                S S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\\
R                        S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\\\
R                     \4   4"S jj5       rS\
R                  4S jr\S 5       rS rS rS rS rSrU =r$ )!FSMTForConditionalGenerationi)  rG   rg  rh  r;   c                 f   > [         TU ]  U5        [        U5      nX l        U R	                  5         g r   )r   r   rf  rG   rn  )r[   r;   
base_modelr   s      r   r   %FSMTForConditionalGeneration.__init__2  s+     v&

 	r!   r<   r   r=   r~  r   r  r  r  r  r   r  labelsr  r   r   r   r@  c                     Ub  UOU R                   R                  nUb  SnU R                  UU
UUUUUUUUU	UUUUS9nUS   nSnUbF  [        5       nU" UR	                  SU R                   R
                  5      UR	                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )uO  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    FSMT uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example Translation:

```python
>>> from transformers import AutoTokenizer, FSMTForConditionalGeneration

>>> mname = "facebook/wmt19-ru-en"
>>> model = FSMTForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = AutoTokenizer.from_pretrained(mname)

>>> src_text = "Машинное обучение - это здорово, не так ли?"
>>> input_ids = tokenizer(src_text, return_tensors="pt").input_ids
>>> outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
"Machine learning is great, isn't it?"
```
NF)r   r   r=   r  r  r~  r   r  r  r  r  r   r   r   r   r%   r   )	losslogitsr  r  r  r  r  r   r  )r;   r  rG   r   r<  rk  r   r  r  r  r  r  r   r  )r[   r<   r   r=   r~  r   r  r  r  r  r   r  r  r  r   r   r   outputs	lm_logitsmasked_lm_lossloss_fctoutputs                         r   r   $FSMTForConditionalGeneration.forward:  s2   z &1%<k$++B]B]I**')/"7+#9/!5+/!5#  
" AJ	')H%innR9S9S&TV\VaVabdVefN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r!   c                 @    [        XR                  R                  5      $ r   )r5   r;   r4   )r[   r  s     r   %prepare_decoder_input_ids_from_labelsBFSMTForConditionalGeneration.prepare_decoder_input_ids_from_labels  s    !&++*B*BCCr!   c           
          / nU  HB  nUR                  5        VVs0 s H  u  pEU[        XQ5      _M     nnnUR                  U5        MD     U$ s  snnf r   )r)  r/  r  )r  beam_idxreordered_pastr#  attn_keyr+  layer_past_news          r   _reorder_cache+FSMTForConditionalGeneration._reorder_cache  se    )J ]g\l\l\n\nDXH/*??\n   !!.1 * 	s   Ac                 .    U R                   R                  $ r   )rG   rl  r  s    r   rs  (FSMTForConditionalGeneration.get_encoder      zz!!!r!   c                 .    U R                   R                  $ r   )rG   rm  r  s    r   rw  (FSMTForConditionalGeneration.get_decoder  r  r!   c                 B    U R                   R                  R                  $ r   rG   rm  r   r  s    r   r  2FSMTForConditionalGeneration.get_output_embeddings  s    zz!!...r!   c                 8    XR                   R                  l        g r   r  r  s     r   r  2FSMTForConditionalGeneration.set_output_embeddings  s    */

'r!   )rG   )NNNNNNNNNNNNNNNN)rk   rl   rm   rn   rp   r  r   r   r   r   r'   r  r	   r  r   r'  r   r   r   r   r  staticmethodr  rs  rw  r  r  rr   r   r   s   @r   r  r  )  s	     79[\z   15158<=A,0487;>B>B048<-1$(,0/3&*#h
E,,-h
 !.h
 $E$4$45	h

 !))9)9 :h
 ELL)h
 $ELL1h
 'u||4h
 "%(9(9":;h
 "%(9(9":;h
  -h
  (5h
 ))*h
 D>h
 $D>h
  'tn!h
" d^#h
$ 
uU\\"O3	4%h
 h
TDELL D  ""/0 0r!   r  c                      ^  \ rS rSrSrU 4S jrS r\S 5       r\S\	4S j5       r
  SS\\   S	\\   4U 4S
 jjjrSrU =r$ )rU   i  a(  
This module produces sinusoidal positional embeddings of any length.

We don't want to save the weight of this embedding since it's not trained (deterministic) and it can be huge.

Padding symbols are ignored.

These embeddings get automatically extended in forward if more positions is needed.
c                 &   > [         TU ]  XU5        g r   )r   r   )r[   num_positionsr   rW   r   s       r   r   &SinusoidalPositionalEmbedding.__init__  s    {Cr!   c                 $   U R                  XU5      nUR                  U R                  R                  U R                  R                  S9n[
        R                  " U5      U l        U R                  R                  5         SU R                  l        g )N)r3   r$   F)	rV   r:   rP   r3   r$   r
   rX   rY   rL   )r[   r  r   rW   rP   s        r   make_weight)SinusoidalPositionalEmbedding.make_weight  sf    ##M+N!2!24;;;M;MNll6*$)!r!   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   U$ )	z
Build sinusoidal embeddings.

This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
r   i'  r   r2   r   r   r%   N)r   logr'   expr(   int64r]  r*   rV  sincosr<  r9   )num_embeddingsr   rW   half_dimru   s        r   rV   +SinusoidalPositionalEmbedding.get_embedding  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQ
r!   rW   c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-  R                  5       U-   $ )z
Replace non-padding symbols with their position numbers.

Position numbers begin at padding_idx+1. Padding symbols are ignored.
r   r   )rf   intr'   cumsumr_  long)re   rW   r/   s      r   make_positions,SinusoidalPositionalEmbedding.make_positions  sJ     yy%))+Tq)11$7$>DDFTTr!   incremental_statetimestepc                 &  > UR                   SS u  pEU R                  S-   U-   nX`R                  R                  S5      :  a&  U R	                  X`R
                  U R                  5        U R                  XR                  5      n[        TU ]!  U5      $ )z/Input is expected to be of size [bsz x seqlen].Nr   r   r   )	r&   rW   rP   r6   r  r   r  r   r   )	r[   inputr  r  r@   r=  max_posr  r   s	           r   r   %SinusoidalPositionalEmbedding.forward  s     {{2A""Q&0[[%%a((W&8&8$:J:JK''/?/?@	wy))r!   )rP   r  )rk   rl   rm   rn   r   r   r  r  rV   r  r  r   r   r	   r   rr   r   r   s   @r   rU   rU     st    D*  & UC U U  ,0%)	* $C=* 6"	* *r!   rU   )r  rf  rE   )r   )r   );r   r   typingr   r   r   r   r   r   r'   r	   r
   torch.nnr   r   activationsr   
generationr   integrations.deepspeedr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   configuration_fsmtr   
get_loggerrk   loggerr    r0   float32rC   rE   ry   r~   r5   r7   Moduler   r   r   r  r/  r   r8   rd  rf  r  rZ   rU   __all__rj   r!   r   <module>r     sz  8 g  : :   0 ! ) @  . , * 
		H	%hDN ' mm@4 /  >G

.299 .btm")) tmnU
299 U
pv
")) v
rc*		 c*L@% W*# W* W*t 
O0#6 O0
O0dE*BLL E*P Or!   