"""PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19"""

import math
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss, LayerNorm

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_fsmt import FSMTConfig


logger = logging.get_logger(__name__)


def invert_mask(attention_mask):
    """Turns 1->0, 0->1, False->True, True-> False"""
    assert attention_mask.dim() == 2
    return attention_mask.eq(0)


def triu_onnx(x, diagonal=0):
    l = x.shape[0]
    arange = torch.arange(l, device=x.device)
    mask = arange.expand(l, l)
    arange = arange.unsqueeze(-1)
    if diagonal:
        arange = arange + diagonal
    mask = mask >= arange
    return x.masked_fill(mask == 0, 0)


def _prepare_fsmt_decoder_inputs(
    config,
    input_ids,
    decoder_input_ids=None,
    decoder_padding_mask=None,
    causal_mask_dtype=torch.float32,
):
    """
    Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided.
    This mimics the default behavior in fairseq. To override it pass in masks. Note: this is not called during
    generation
    """
    pad_token_id = config.pad_token_id
    if decoder_input_ids is None:
        decoder_input_ids = shift_tokens_right(input_ids, pad_token_id)
    bsz, tgt_len = decoder_input_ids.size()
    if decoder_padding_mask is None:
        decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id)
    else:
        decoder_padding_mask = invert_mask(decoder_padding_mask)
    causal_mask = triu_onnx(fill_with_neg_inf(torch.zeros(tgt_len, tgt_len, dtype=causal_mask_dtype)), 1).to(
        device=decoder_input_ids.device
    )
    return decoder_input_ids, decoder_padding_mask, causal_mask
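# Illustrative sketch (not part of the original file): what `_prepare_fsmt_decoder_inputs`
# builds for a 3-token target when no masks are passed in. The causal mask blocks attention
# to future positions by putting a very large negative value strictly above the diagonal:
#
#   causal_mask = triu_onnx(fill_with_neg_inf(torch.zeros(3, 3)), 1)
#   # 0 on and below the diagonal, the float32 minimum (~-3.4e38) above it:
#   # [[0, min, min],
#   #  [0,   0, min],
#   #  [0,   0,   0]]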
@auto_docstring
class PretrainedFSMTModel(PreTrainedModel):
    config_class = FSMTConfig
    base_model_prefix = "model"

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, SinusoidalPositionalEmbedding):
            weight = module.get_embedding(*module.weight.shape, module.padding_idx)
            weight = nn.Parameter(weight, requires_grad=False)
            weight.detach_()
            module.weight = weight
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs


def _make_linear_from_emb(emb):
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer


def _check_shapes(shape_1, shape2):
    if shape_1 != shape2:
        raise AssertionError(f"shape mismatch: {shape_1} != {shape2}")


def shift_tokens_right(input_ids, pad_token_id):
    """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>)."""
    # replace possible -100 values in labels by `pad_token_id`
    input_ids.masked_fill_(input_ids == -100, pad_token_id)

    prev_output_tokens = input_ids.clone()
    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens


def make_padding_mask(input_ids, padding_idx=1):
    """True for pad tokens"""
    padding_mask = input_ids.eq(padding_idx)
    if not padding_mask.any():
        padding_mask = None
    return padding_mask
class EncoderLayer(nn.Module):
    def __init__(self, config: FSMTConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = Attention(self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)

    def forward(self, x, encoder_padding_mask, layer_head_mask, output_attentions=False):
        """
        Args:
            x (`torch.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
            encoder_padding_mask (`torch.ByteTensor`): binary ByteTensor of shape
                *(batch, src_len)* where padding elements are indicated by `1`.
                For t_tgt, t_src is excluded (or masked out); =0 means it is
                included in attention.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                *(config.encoder_attention_heads,)*.

        Returns:
            encoded output of shape *(seq_len, batch, embed_dim)*
        """
        residual = x
        x, attn_weights = self.self_attn(
            query=x,
            key=x,
            key_padding_mask=encoder_padding_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        residual = x
        x = self.activation_fn(self.fc1(x))
        x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.final_layer_norm(x)
        return x, attn_weights
eej deej de	de	de	fddZ
  ZS )FSMTEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`EncoderLayer`].

    Args:
        config: FSMTConfig
    r6   c                    s   t     j| _ j| _|j| _|| _|j} jrt	
|nd| _t j| j d || j| _t fddt jD | _d S )N      ?r   c                       g | ]}t  qS r   )rj   .0_r6   r   r   
<listcomp>b      z(FSMTEncoder.__init__.<locals>.<listcomp>)rl   rm   rk   Zencoder_layerdrop	layerdroprL   embed_tokensembedding_dimscale_embeddingmathsqrtembed_scalerJ   max_position_embeddingsembed_positionsr	   
ModuleListrangeZencoder_layerslayers)rP   r6   r   ro   r{   r   r   rm   W  s   
$zFSMTEncoder.__init__NFTr7   r   inputs_embeds	head_maskr   output_hidden_statesreturn_dictc                 C   s   |durt |}|dur|durtd|dur&| || j }| |}n0|durR|| j }|dddddf |dddddf d| jj}	| |	}ntd|| }
tj	j
|
| j
| jd}
|
dd}
|rodnd}|rudnd}|dur| d t| jksJ dt| j d	| d  d
t| jD ]@\}}|r|
dd}
||
f7 }|
dd}
tg }| jr|| jk rd}n||
||dur|| nd|d\}
}|r||f }q|
dd}
|r||
f7 }|stdd |
||fD S t|
||dS )a  
        Args:
            input_ids (`torch.LongTensor`): tokens in the source language of shape
                *(batch, src_len)*
            attention_mask (`torch.LongTensor`): indicating which indices are padding tokens
            inputs_embeds (`torch.FloatTensor`):
                embedding vectors of shape *(batch, src_len, embed_dim)*
            head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        Returns:
            BaseModelOutput or Tuple comprised of:

                - **x** (`torch.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
                - **encoder_states** (`Tuple(torch.FloatTensor)`): all intermediate hidden states of shape *(src_len,
                  batch, embed_dim)*. Only populated if *output_hidden_states:* is True.
                - **all_attentions** (`Tuple(torch.FloatTensor)`): Attention weights for each layer.
                During training might not be of length n_layers because of layer dropout.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   r   r   z&The head_mask should be specified for  layers, but it is for .)r   r   c                 s       | ]	}|d ur|V  qd S Nr   r   vr   r   r   	<genexpr>  s    z&FSMTEncoder.forward.<locals>.<genexpr>last_hidden_statehidden_states
attentions)r   
ValueErrorr   r   r   r(   r   rL   r	   r   rk   r   	transposer1   lenr   	enumerater$   randr   tupler   )rP   r7   r   r   r   r   r   r   Z	embed_posposition_idsr)   Zencoder_statesZall_attentionsidxZencoder_layerdropout_probabilityZattnr   r   r   r   d  s\   !
 




zFSMTEncoder.forward)NNNFFT)r[   r\   r]   __doc__r   rm   r$   r   r   boolr   r   r   r   r{   r   r   O  s0    r   c                       s:   e Zd Zdef fddZ							dddZ  ZS )	DecoderLayerr6   c                    s   t    |j| _t| j|j|jd| _|j| _t	|j
 | _|j| _t| j| _t| j|j|jdd| _t| j| _t| j|j| _t|j| j| _t| j| _d S )N)ro   	num_headsrk   T)rk   encoder_decoder_attention)rl   rm   rn   ro   rp   Zdecoder_attention_headsrq   rr   rk   r   rt   ru   rv   r   rs   encoder_attnencoder_attn_layer_normr	   rF   Zdecoder_ffn_dimrw   rx   ry   rz   r{   r   r   rm     s*   
zDecoderLayer.__init__NFc
              	   C   s  |}
|d u ri }| j |||||||	d\}}tjj|| j| jd}|
| }| |}|}
| jj| j jks6J | j||||||	d\}}tjj|| j| jd}|
| }| |}|}
| 	| 
|}tjj|| j| jd}| |}tjj|| j| jd}|
| }| |}||||fS )N)r}   r~   layer_stater   	attn_maskr   r   r   )r}   r~   r   r   r   r   )rr   r	   r   rk   r   rs   r   	cache_keyr   ru   rw   rv   rx   ry   )rP   r)   encoder_hidden_statesencoder_attn_maskr   r=   r   cross_attn_layer_head_maskr9   r   r   Zself_attn_weightsZcross_attn_weightsr   r   r   r     sP   
	




zDecoderLayer.forward)NNNNNNFr   r   r   r{   r   r     s    r   c                       s   e Zd ZdZdedejf fddZdd Z											
dde	j
de	j
de	j
de	j
de	j
dee	j
 dee	j
 dee	j
 deee	j  dedededefddZ  ZS )FSMTDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DecoderLayer`]

    Args:
        config: FSMTConfig
        embed_tokens (nn.Embedding): output embedding
    r6   r   c                    s  t     j| _ j| _|j| _ jrt j	nd| _
|| _|j}t j| j d || j| _t fddt jD | _t rhdd l}|jj| jjd d | jjj}W d    n1 sbw   Y  n| jjj}tj|d |d dd| _| jj| j_d S )	Nr   r   c                    r   r   )r   r   r   r   r   r   2  r   z(FSMTDecoder.__init__.<locals>.<listcomp>r   )Zmodifier_rankFr`   )rl   rm   rk   Zdecoder_layerdropr   rL   r   r   r   rn   r   r   r   rJ   r   r   r	   r   r   Zdecoder_layersr   r   	deepspeedzeroZGatheredParametersrG   r#   rF   output_projection)rP   r6   r   ro   r   Zembed_tokens_weight_shaper{   r   r   rm   '  s(   
 
zFSMTDecoder.__init__c                 C   s   | j j| j_d S r   )r   rG   r   rP   r   r   r   _tie_weights>  s   zFSMTDecoder._tie_weightsNFTr7   r   r   r9   decoder_causal_maskr   r   cross_attn_head_maskpast_key_values	use_cacher   r   r   c                 C   s  |durt |}|dur|durtd|dur<| |}|
r3|ddddf }|ddddf }| || j }n0|durh|dddddf |dddddf d| jj}| |}|| j }ntd||7 }tj	j
|| j
| jd}|dd}|dd}|rdnd}|rdnd}|rdnd}g }t||gd	d
gD ](\}}|dur| d t| jksJ d| dt| j d| d  dqt| jD ]i\}}|r|dd}||f7 }|dd}| jrtg }|| jk rq|	dur|	| nd}||||||||dur|| nd|dur|| nd|d	\}}}}|
r.||  |r;||f7 }||f7 }q|rP|dd}||f7 }|dd}|dd}|dd}| |}|
rf|nd}|sytdd |||||fD S t|||||dS )a  
        Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
        EMNLP 2019).

        Args:
            input_ids (`torch.LongTensor` of shape `(batch, tgt_len)`):
                previous decoder outputs for teacher forcing
            encoder_hidden_states: output from the encoder, used for
                encoder-side attention
            encoder_padding_mask: for ignoring pad tokens
            past_key_values (dict or None): dictionary used for storing state during generation
            head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        Returns:
            BaseModelOutputWithPast or tuple:

                - the decoder's features of shape *(batch, tgt_len, embed_dim)*
                - the cache
                - hidden states
                - attentions
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer"   r   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r   r   r   r   zThe `z` should be specified for r   r   )r   r9   r   r=   r   r   r   c                 s   r   r   r   r   r   r   r   r     s    z&FSMTDecoder.forward.<locals>.<genexpr>)r   r   r   r   cross_attentions)r   r   r   r   r   r(   r   rL   r	   r   rk   r   r   zipr1   r   r   r   r$   r   r   appendcopyr   r   r   )rP   r7   r   r   r9   r   r   r   r   r   r   r   r   r   	positionsr)   r   Zall_hidden_statesZall_self_attnsZall_cross_attnsZnext_decoder_cacher   Z	mask_namer   Zdecoder_layerr   r   Zlayer_self_attn
layer_pastZlayer_cross_attnZ
next_cacher   r   r   r   A  s   0
 








zFSMTDecoder.forward)NNNNFFFT)r[   r\   r]   r   r   r	   rO   rm   r   r$   r   r   r   FloatTensorr   r   r   r   r   r{   r   r     sN    
	
r   c                 C   s.   |   D ]\}}|d ur|d|| |< q| S )Nr   )itemsZindex_select)
def _reorder_buffer(attn_cache, new_order):
    for k, input_buffer_k in attn_cache.items():
        if input_buffer_k is not None:
            attn_cache[k] = input_buffer_k.index_select(0, new_order)
    return attn_cache


class Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        encoder_decoder_attention=False,  # otherwise self_attention
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.encoder_decoder_attention = encoder_decoder_attention
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self"

    def _shape(self, tensor, seq_len, bsz):
        return tensor.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)

    def forward(
        self,
        query,
        key: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
        attn_mask: Optional[Tensor] = None,
        layer_head_mask: Optional[Tensor] = None,
        output_attentions=False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Input shape: Time(SeqLen) x Batch x Channel"""
        static_kv = self.encoder_decoder_attention
        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        # get here for encoder decoder cause of static_kv
        if layer_state is not None:  # reuse k,v and encoder_padding_mask
            saved_state = layer_state.get(self.cache_key, {})
            if "prev_key" in saved_state and static_kv:
                # previous time steps are cached - no need to recompute key and value if they are static
                key = None
        else:
            # this branch is hit by encoder
            saved_state = None
            layer_state = {}

        q = self.q_proj(query) * self.scaling
        if static_kv:
            if key is None:
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)
        else:
            k = self.k_proj(query)
            v = self.v_proj(query)

        q = self._shape(q, tgt_len, bsz)
        if k is not None:
            k = self._shape(k, -1, bsz)
        if v is not None:
            v = self._shape(v, -1, bsz)

        if saved_state is not None:
            k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz)

        # Update cache
        layer_state[self.cache_key] = {
            "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim),
            "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
            "prev_key_padding_mask": key_padding_mask if not static_kv else None,
        }

        assert k is not None
        src_len = k.size(1)
        attn_weights = torch.bmm(q, k.transpose(1, 2))
        assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len)

        if attn_mask is not None:
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None
        assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len)

        if key_padding_mask is not None:  # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)
            attn_weights = attn_weights.masked_fill(reshaped, torch.finfo(attn_weights.dtype).min)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            assert layer_head_mask.size() == (self.num_heads,), (
                f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
            )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # make sure that attn_weights are included in graph
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        assert v is not None
        attn_output = torch.bmm(attn_probs, v)
        assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights_reshaped

    def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if "prev_key" in saved_state:
            _prev_key = saved_state["prev_key"]
            assert _prev_key is not None
            prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                k = prev_key
            else:
                assert k is not None
                k = torch.cat([prev_key, k], dim=1)
        if "prev_value" in saved_state:
            _prev_value = saved_state["prev_value"]
            assert _prev_value is not None
            prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                v = prev_value
            else:
                assert v is not None
                v = torch.cat([prev_value, v], dim=1)
        assert k is not None and v is not None
        prev_key_padding_mask: Optional[Tensor] = saved_state.get("prev_key_padding_mask", None)
        if prev_key_padding_mask is not None:
            if static_kv:
                new_key_padding_mask = prev_key_padding_mask
            else:
                new_key_padding_mask = torch.cat([prev_key_padding_mask, key_padding_mask], dim=1)
        else:
            new_key_padding_mask = key_padding_mask
        return k, v, new_key_padding_mask


def fill_with_neg_inf(t):
    """FP16-compatible function that fills a input_ids with -inf."""
    return t.float().fill_(torch.finfo(t.dtype).min).type_as(t)


def _get_shape(t):
    return getattr(t, "shape", None)
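# Illustrative sketch (not part of the original file): during incremental decoding each
# `DecoderLayer` carries a per-layer `layer_state` dict that `Attention` fills under its
# `cache_key` ("self" or "encoder_decoder"), roughly:
#
#   layer_state["self"] = {
#       "prev_key": k.view(bsz, num_heads, -1, head_dim),
#       "prev_value": v.view(bsz, num_heads, -1, head_dim),
#       "prev_key_padding_mask": key_padding_mask,  # None for the static cross-attention keys
#   }
#
# `_reorder_buffer` above index-selects these tensors along the batch dimension when beam
# search reorders hypotheses (see `FSMTForConditionalGeneration._reorder_cache`).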
@auto_docstring
class FSMTModel(PretrainedFSMTModel):
    _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]

    def __init__(self, config: FSMTConfig):
        super().__init__(config)

        padding_idx = config.pad_token_id
        encoder_embed_tokens = nn.Embedding(config.src_vocab_size, config.d_model, padding_idx)
        decoder_embed_tokens = nn.Embedding(config.tgt_vocab_size, config.d_model, padding_idx)

        self.encoder = FSMTEncoder(config, encoder_embed_tokens)
        self.decoder = FSMTDecoder(config, decoder_embed_tokens)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings())
            self._tie_or_clone_weights(self.decoder.output_projection, self.get_input_embeddings())

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            FSMT uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        if decoder_input_ids is None:
            use_cache = False

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # make masks if user doesn't supply
        if not use_cache and input_ids is not None:
            decoder_input_ids, decoder_padding_mask, causal_mask = _prepare_fsmt_decoder_inputs(
                self.config,
                input_ids,
                decoder_input_ids=decoder_input_ids,
                decoder_padding_mask=decoder_attention_mask,
                causal_mask_dtype=self.decoder.embed_tokens.weight.dtype,
            )
        else:
            decoder_padding_mask, causal_mask = None, None

        if decoder_input_ids is None and decoder_inputs_embeds is None:
            raise ValueError("Make sure that `decoder_input_ids` or `decoder_inputs_embeds` are passed.")

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            decoder_input_ids,
            encoder_outputs[0],
            attention_mask,
            decoder_padding_mask,
            decoder_causal_mask=causal_mask,
            inputs_embeds=decoder_inputs_embeds,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.encoder.embed_tokens

    def set_input_embeddings(self, value):
        self.encoder.embed_tokens = value

    def get_output_embeddings(self):
        return self.decoder.embed_tokens

    def set_output_embeddings(self, value):
        self.decoder.embed_tokens = value
@auto_docstring(
    custom_intro="""
    The FSMT Model with a language modeling head. Can be used for summarization.
    """
)
class FSMTForConditionalGeneration(PretrainedFSMTModel, GenerationMixin):
    base_model_prefix = "model"
    _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]

    def __init__(self, config: FSMTConfig):
        super().__init__(config)
        base_model = FSMTModel(config)
        self.model = base_model

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            FSMT uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example Translation:

        ```python
        >>> from transformers import AutoTokenizer, FSMTForConditionalGeneration

        >>> mname = "facebook/wmt19-ru-en"
        >>> model = FSMTForConditionalGeneration.from_pretrained(mname)
        >>> tokenizer = AutoTokenizer.from_pretrained(mname)

        >>> src_text = "Машинное обучение - это здорово, не так ли?"
        >>> input_ids = tokenizer(src_text, return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        "Machine learning is great, isn't it?"
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            use_cache = False

        outputs = self.model(
            input_ids,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        lm_logits = outputs[0]

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.tgt_vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = []
        for layer_past in past_key_values:
            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
            layer_past_new = {
                attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
            }
            reordered_past.append(layer_past_new)
        return reordered_past

    def get_encoder(self):
        return self.model.encoder

    def get_decoder(self):
        return self.model.decoder

    def get_output_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_output_embeddings(self, value):
        self.model.decoder.embed_tokens = value
class SinusoidalPositionalEmbedding(nn.Embedding):
    """
    This module produces sinusoidal positional embeddings of any length.

    We don't want to save the weight of this embedding since it's not trained (deterministic) and it can be huge.

    Padding symbols are ignored.

    These embeddings get automatically extended in forward if more positions is needed.
    """

    def __init__(self, num_positions, embedding_dim, padding_idx):
        super().__init__(num_positions, embedding_dim, padding_idx)

    def make_weight(self, num_positions, embedding_dim, padding_idx):
        weight = self.get_embedding(num_positions, embedding_dim, padding_idx)
        # in forward, put the weights on the correct dtype and device of the param
        weight = weight.to(dtype=self.weight.dtype, device=self.weight.device)
        self.weight = nn.Parameter(weight)
        self.weight.detach_()
        self.weight.requires_grad = False

    @staticmethod
    def get_embedding(num_embeddings, embedding_dim, padding_idx):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        return emb

    @staticmethod
    def make_positions(tensor, padding_idx: int):
        """
        Replace non-padding symbols with their position numbers.

        Position numbers begin at padding_idx+1. Padding symbols are ignored.
        """
        mask = tensor.ne(padding_idx).int()
        return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx

    def forward(
        self,
        input,
        incremental_state: Optional[Any] = None,
        timestep: Optional[Tensor] = None,
    ):
        """Input is expected to be of size [bsz x seqlen]."""
        bsz, seq_len = input.shape[:2]
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weight.size(0):
            # expand embeddings if needed
            self.make_weight(max_pos, self.embedding_dim, self.padding_idx)
        positions = self.make_positions(input, self.padding_idx)
        return super().forward(positions)
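# Illustrative sketch (not part of the original file): how `make_positions` numbers tokens,
# assuming the fairseq convention padding_idx=1. Positions start at padding_idx + 1 and pad
# tokens keep the padding_idx slot (whose embedding row is zeroed in `get_embedding`):
#
#   tokens = torch.tensor([[5, 7, 2, 1, 1]])
#   SinusoidalPositionalEmbedding.make_positions(tokens, padding_idx=1)
#   # tensor([[2, 3, 4, 1, 1]])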
__all__ = ["FSMTForConditionalGeneration", "FSMTModel", "PretrainedFSMTModel"]