"""PyTorch XLM model."""

import itertools
import math
from dataclasses import dataclass
from typing import Callable, Dict, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import gelu, get_activation
from ...generation import GenerationMixin
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_xlm import XLMConfig


logger = logging.get_logger(__name__)


def create_sinusoidal_embeddings(n_pos, dim, out):
    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
    out.requires_grad = False
    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
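
# Illustrative sketch (not from the original file): `create_sinusoidal_embeddings` fills an existing
# embedding weight in place, which is how `XLMPreTrainedModel._init_weights` applies it further below:
#
#   emb = nn.Embedding(512, 64)
#   create_sinusoidal_embeddings(512, 64, out=emb.weight)  # even dims hold sin, odd dims hold cos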


def get_masks(slen, lengths, causal, padding_mask=None):
    """
    Generate hidden states mask, and optionally an attention mask.
    """
    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
    if padding_mask is not None:
        mask = padding_mask
    else:
        assert lengths.max().item() <= slen
        mask = alen < lengths[:, None]

    # attention mask is the same as mask, or triangular inferior attention (causal)
    bs = lengths.size(0)
    if causal:
        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
    else:
        attn_mask = mask

    # sanity check
    assert mask.size() == (bs, slen)
    assert causal is False or attn_mask.size() == (bs, slen, slen)

    return mask, attn_mask


@dataclass
class XLMSquadHeadOutput(ModelOutput):
    """
    Base class for outputs of question answering models using a [`~modeling_utils.XLMSQuADHead`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
            (beam-search).
        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the `is_impossible` label of the answers.

    """

    loss: Optional[torch.FloatTensor] = None
    start_top_log_probs: Optional[torch.FloatTensor] = None
    start_top_index: Optional[torch.LongTensor] = None
    end_top_log_probs: Optional[torch.FloatTensor] = None
    end_top_index: Optional[torch.LongTensor] = None
    cls_logits: Optional[torch.FloatTensor] = None


class XLMPoolerStartLogits(nn.Module):
    """
    Compute SQuAD start logits from sequence hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, 1)

    def forward(
        self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
    ) -> torch.FloatTensor:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        Returns:
            `torch.FloatTensor`: The start logits for SQuAD.
        """
        x = self.dense(hidden_states).squeeze(-1)

        if p_mask is not None:
            if p_mask.dtype == torch.float16:
                x = x * (1 - p_mask) - 65500 * p_mask
            else:
                x = x * (1 - p_mask) - 1e30 * p_mask

        return x


class XLMPoolerEndLogits(nn.Module):
    """
    Compute SQuAD end logits from sequence hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.activation = nn.Tanh()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dense_1 = nn.Linear(config.hidden_size, 1)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        start_states: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        p_mask: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The end logits for SQuAD.
        """
        assert start_states is not None or start_positions is not None, (
            "One of start_states, start_positions should be not None"
        )
        if start_positions is not None:
            slen, hsz = hidden_states.shape[-2:]
            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            start_states = hidden_states.gather(-2, start_positions)  # shape (bsz, 1, hsz)
            start_states = start_states.expand(-1, slen, -1)  # shape (bsz, slen, hsz)

        x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
        x = self.activation(x)
        x = self.LayerNorm(x)
        x = self.dense_1(x).squeeze(-1)

        if p_mask is not None:
            if p_mask.dtype == torch.float16:
                x = x * (1 - p_mask) - 65500 * p_mask
            else:
                x = x * (1 - p_mask) - 1e30 * p_mask

        return x


class XLMPoolerAnswerClass(nn.Module):
    """
    Compute SQuAD 2.0 answer class from classification and start tokens hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.activation = nn.Tanh()
        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        start_states: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        cls_index: Optional[torch.LongTensor] = None,
    ) -> torch.FloatTensor:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The SQuAD 2.0 answer class.
        """
        hsz = hidden_states.shape[-1]
        assert start_states is not None or start_positions is not None, (
            "One of start_states, start_positions should be not None"
        )
        if start_positions is not None:
            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            start_states = hidden_states.gather(-2, start_positions).squeeze(-2)  # shape (bsz, hsz)

        if cls_index is not None:
            cls_index = cls_index[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, hsz)
        else:
            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)

        x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
        x = self.activation(x)
        x = self.dense_1(x).squeeze(-1)

        return x


class XLMSQuADHead(nn.Module):
    """
    A SQuAD head inspired by XLNet.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.start_n_top = config.start_n_top
        self.end_n_top = config.end_n_top

        self.start_logits = XLMPoolerStartLogits(config)
        self.end_logits = XLMPoolerEndLogits(config)
        self.answer_class = XLMPoolerAnswerClass(config)

    @auto_docstring
    def forward(
        self,
        hidden_states: torch.FloatTensor,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        cls_index: Optional[torch.LongTensor] = None,
        is_impossible: Optional[torch.LongTensor] = None,
        p_mask: Optional[torch.FloatTensor] = None,
        return_dict: bool = False,
    ) -> Union[XLMSquadHeadOutput, Tuple[torch.FloatTensor]]:
        """
        hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
            Final hidden states of the model on the sequence tokens.
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Positions of the first token for the labeled span.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Positions of the last token for the labeled span.
        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Whether the question has a possible answer in the paragraph or not.
        p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
            Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.
        """
        start_logits = self.start_logits(hidden_states, p_mask=p_mask)

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, let's remove the dimension added by batch splitting
            for x in (start_positions, end_positions, cls_index, is_impossible):
                if x is not None and x.dim() > 1:
                    x.squeeze_(-1)

            # during training, compute the end logits based on the ground truth of the start position
            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)

            loss_fct = CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

            if cls_index is not None and is_impossible is not None:
                # Predict answerability from the representation of CLS and START
                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
                loss_fct_cls = nn.BCEWithLogitsLoss()
                cls_loss = loss_fct_cls(cls_logits, is_impossible)

                # multiply the answerability loss by 0.5 so its scale is comparable to start_loss and end_loss
                total_loss += cls_loss * 0.5

            return XLMSquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)

        else:
            # during inference, compute the end logits based on beam search
            bsz, slen, hsz = hidden_states.size()
            start_log_probs = nn.functional.softmax(start_logits, dim=-1)  # shape (bsz, slen)

            start_top_log_probs, start_top_index = torch.topk(
                start_log_probs, self.start_n_top, dim=-1
            )  # shape (bsz, start_n_top)
            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)

            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
                start_states
            )  # shape (bsz, slen, start_n_top, hsz)
            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
            end_log_probs = nn.functional.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)

            end_top_log_probs, end_top_index = torch.topk(
                end_log_probs, self.end_n_top, dim=1
            )  # shape (bsz, end_n_top, start_n_top)
            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)

            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)

            if not return_dict:
                return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
            else:
                return XLMSquadHeadOutput(
                    start_top_log_probs=start_top_log_probs,
                    start_top_index=start_top_index,
                    end_top_log_probs=end_top_log_probs,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
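
# Illustrative sketch (not from the original file): XLMSQuADHead returns a loss during training and
# beam-search candidates at inference, assuming a config with `start_n_top`/`end_n_top` set:
#
#   head = XLMSQuADHead(config)
#   train_out = head(hidden_states, start_positions=starts, end_positions=ends, return_dict=True)  # -> .loss
#   infer_out = head(hidden_states, return_dict=True)  # -> .start_top_index, .end_top_index, .cls_logits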
zXLMSQuADHead.forward)NNNNNF)rK   rL   rM   rN   r   rU   r   r,   r-   r   rP   boolr   rD   r   rf   rh   r&   r&   r[   r'   r     s6    			r   c                       sJ   e Zd ZdZdef fddZ	ddejdeej	 dejfd	d
Z
  ZS )XLMSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and activation.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()

        self.summary_type = getattr(config, "summary_type", "last")
        if self.summary_type == "attn":
            # A standard multi-head attention module with absolute positional embedding would be needed for that.
            raise NotImplementedError

        self.summary = nn.Identity()
        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = nn.Linear(config.hidden_size, num_classes)

        activation_string = getattr(config, "summary_activation", None)
        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()

        self.first_dropout = nn.Identity()
        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)

        self.last_dropout = nn.Identity()
        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(
        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
    ) -> torch.FloatTensor:
        """
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        """
        if self.summary_type == "last":
            output = hidden_states[:, -1]
        elif self.summary_type == "first":
            output = hidden_states[:, 0]
        elif self.summary_type == "mean":
            output = hidden_states.mean(dim=1)
        elif self.summary_type == "cls_index":
            if cls_index is None:
                cls_index = torch.full_like(
                    hidden_states[..., :1, :],
                    hidden_states.shape[-2] - 1,
                    dtype=torch.long,
                )
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dims of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
        elif self.summary_type == "attn":
            raise NotImplementedError

        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        output = self.last_dropout(output)

        return output


class MultiHeadAttention(nn.Module):
    NEW_ID = itertools.count()

    def __init__(self, n_heads, dim, config):
        super().__init__()
        self.layer_id = next(MultiHeadAttention.NEW_ID)
        self.dim = dim
        self.n_heads = n_heads
        self.dropout = config.attention_dropout
        assert self.dim % self.n_heads == 0

        self.q_lin = nn.Linear(dim, dim)
        self.k_lin = nn.Linear(dim, dim)
        self.v_lin = nn.Linear(dim, dim)
        self.out_lin = nn.Linear(dim, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        attention_head_size = self.dim // self.n_heads
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads)
        # Prune linear layers
        self.q_lin = prune_linear_layer(self.q_lin, index)
        self.k_lin = prune_linear_layer(self.k_lin, index)
        self.v_lin = prune_linear_layer(self.v_lin, index)
        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.dim = attention_head_size * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, input, mask, kv=None, cache=None, head_mask=None, output_attentions=False):
        """
        Self-attention (if kv is None) or attention over source sentence (provided by kv).
        """
        # Input is (bs, qlen, dim), mask is (bs, klen) (non-causal) or (bs, klen, klen)
        bs, qlen, dim = input.size()
        if kv is None:
            klen = qlen if cache is None else cache["slen"] + qlen
        else:
            klen = kv.size(1)
        n_heads = self.n_heads
        dim_per_head = self.dim // n_heads
        mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)

        def shape(x):
            """projection"""
            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)

        def unshape(x):
            """compute context"""
            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)

        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
        if kv is None:
            k = shape(self.k_lin(input))  # (bs, n_heads, qlen, dim_per_head)
            v = shape(self.v_lin(input))  # (bs, n_heads, qlen, dim_per_head)
        elif cache is None or self.layer_id not in cache:
            k = v = kv
            k = shape(self.k_lin(k))  # (bs, n_heads, klen, dim_per_head)
            v = shape(self.v_lin(v))  # (bs, n_heads, klen, dim_per_head)

        if cache is not None:
            if self.layer_id in cache:
                if kv is None:
                    k_, v_ = cache[self.layer_id]
                    k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
                    v = torch.cat([v_, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
                else:
                    k, v = cache[self.layer_id]
            cache[self.layer_id] = (k, v)

        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, qlen, dim_per_head)
        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, qlen, klen)
        mask = (mask == 0).view(mask_reshape).expand_as(scores)  # (bs, n_heads, qlen, klen)
        scores.masked_fill_(mask, torch.finfo(scores.dtype).min)  # (bs, n_heads, qlen, klen)

        weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
        weights = nn.functional.dropout(weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
        context = unshape(context)  # (bs, qlen, dim)

        outputs = (self.out_lin(context),)
        if output_attentions:
            outputs = outputs + (weights,)
        return outputs


class TransformerFFN(nn.Module):
    def __init__(self, in_dim, dim_hidden, out_dim, config):
        super().__init__()
        self.dropout = config.dropout
        self.lin1 = nn.Linear(in_dim, dim_hidden)
        self.lin2 = nn.Linear(dim_hidden, out_dim)
        self.act = gelu if config.gelu_activation else nn.functional.relu
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1

    def forward(self, input):
        return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)

    def ff_chunk(self, input):
        x = self.lin1(input)
        x = self.act(x)
        x = self.lin2(x)
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        return x


class XLMPreTrainedModel(PreTrainedModel):
    config_class = XLMConfig
    load_tf_weights = None
    base_model_prefix = "transformer"

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    @property
    def dummy_inputs(self):
        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
        if self.config.use_lang_emb and self.config.n_langs > 1:
            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
        else:
            langs_list = None
        return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Embedding):
            if self.config is not None and self.config.embed_init_std is not None:
                nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if isinstance(module, nn.Linear):
            if self.config is not None and self.config.init_std is not None:
                nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0.0)
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, XLMModel) and self.config.sinusoidal_embeddings:
            create_sinusoidal_embeddings(
                self.config.max_position_embeddings, self.config.emb_dim, out=module.position_embeddings.weight
            )


@dataclass
class XLMForQuestionAnsweringOutput(ModelOutput):
    """

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
            (beam-search).
        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the `is_impossible` label of the answers.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    start_top_log_probs: Optional[torch.FloatTensor] = None
    start_top_index: Optional[torch.LongTensor] = None
    end_top_log_probs: Optional[torch.FloatTensor] = None
    end_top_index: Optional[torch.LongTensor] = None
    cls_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@auto_docstring
class XLMModel(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # encoder / decoder
        self.is_encoder = config.is_encoder
        self.is_decoder = not config.is_encoder
        if self.is_decoder:
            raise NotImplementedError("Currently XLM can only be used as an encoder")
        self.causal = config.causal

        # dictionary / languages
        self.n_langs = config.n_langs
        self.use_lang_emb = config.use_lang_emb
        self.n_words = config.n_words
        self.eos_index = config.eos_index
        self.pad_index = config.pad_index

        # model parameters
        self.dim = config.emb_dim
        self.hidden_dim = self.dim * 4
        self.n_heads = config.n_heads
        self.n_layers = config.n_layers
        self.dropout = config.dropout
        self.attention_dropout = config.attention_dropout
        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"

        # embeddings
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
        if config.n_langs > 1 and config.use_lang_emb:
            self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
        self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)

        # transformer layers
        self.attentions = nn.ModuleList()
        self.layer_norm1 = nn.ModuleList()
        self.ffns = nn.ModuleList()
        self.layer_norm2 = nn.ModuleList()

        for _ in range(self.n_layers):
            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config))
            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))

        if hasattr(config, "pruned_heads"):
            pruned_heads = config.pruned_heads.copy().items()
            config.pruned_heads = {}
            for layer, heads in pruned_heads:
                if self.attentions[int(layer)].n_heads == config.n_heads:
                    self.prune_heads({int(layer): list(map(int, heads))})

        # Initialize weights and apply final processing
        self.post_init()
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def get_input_embeddings(self):
        return self.embeddings

    def set_input_embeddings(self, new_embeddings):
        self.embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.attentions[layer].prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None:
            bs, slen = input_ids.size()
        else:
            bs, slen = inputs_embeds.size()[:-1]

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if lengths is None:
            if input_ids is not None:
                lengths = (input_ids != self.pad_index).sum(dim=1).long()
            else:
                lengths = torch.tensor([slen] * bs, device=device)

        # check inputs
        assert lengths.size(0) == bs
        assert lengths.max().item() <= slen

        # generate masks
        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)

        # position_ids
        if position_ids is None:
            position_ids = self.position_ids[:, :slen]
        else:
            assert position_ids.size() == (bs, slen)

        # langs
        if langs is not None:
            assert langs.size() == (bs, slen)

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.n_layers)

        # do not recompute cached elements
        if cache is not None and input_ids is not None:
            _slen = slen - cache["slen"]
            input_ids = input_ids[:, -_slen:]
            position_ids = position_ids[:, -_slen:]
            if langs is not None:
                langs = langs[:, -_slen:]
            mask = mask[:, -_slen:]
            attn_mask = attn_mask[:, -_slen:]

        # embeddings
        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
        if langs is not None and self.use_lang_emb and self.n_langs > 1:
            tensor = tensor + self.lang_embeddings(langs)
        if token_type_ids is not None:
            tensor = tensor + self.embeddings(token_type_ids)
        tensor = self.layer_norm_emb(tensor)
        tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training)
        tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # transformer layers
        hidden_states = () if output_hidden_states else None
        attentions = () if output_attentions else None
        for i in range(self.n_layers):
            if output_hidden_states:
                hidden_states = hidden_states + (tensor,)

            # self attention
            attn_outputs = self.attentions[i](
                tensor,
                attn_mask,
                cache=cache,
                head_mask=head_mask[i],
                output_attentions=output_attentions,
            )
            attn = attn_outputs[0]
            if output_attentions:
                attentions = attentions + (attn_outputs[1],)
            attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
            tensor = tensor + attn
            tensor = self.layer_norm1[i](tensor)

            # FFN
            tensor = tensor + self.ffns[i](tensor)
            tensor = self.layer_norm2[i](tensor)
            tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # Add last hidden state
        if output_hidden_states:
            hidden_states = hidden_states + (tensor,)

        # update cache length
        if cache is not None:
            cache["slen"] += tensor.size(1)

        if not return_dict:
            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
        return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)


class XLMPredLayer(nn.Module):
    """
|jdd| _d S )NFTr}   )Zin_featuresZ	n_classesZcutoffsZ	div_valueZ	head_bias)rT   rU   asmr   r   r   r   rV   projZAdaptiveLogSoftmaxWithLossZasm_cutoffsZasm_div_value)rZ   rR   r$   r[   r&   r'   rU     s   

zXLMPredLayer.__init__Nc                 C   s   d}| j du r-| |}|f| }|dur+tjj|d| j|ddd}|f| }|S | j|}|f| }|durI| ||\}}|f| }|S )z,Compute the loss, and optionally the scores.r&   FNr`   r   )Z	reduction)r!  r"  r   r   Zcross_entropyr   r   Zlog_prob)rZ   re   yr   r   rE   r  r&   r&   r'   rf     s   


"


zXLMPredLayer.forwardrg   )rK   rL   rM   rN   rU   rf   rh   r&   r&   r[   r'   r     s    r   z


@auto_docstring(
    custom_intro="""
    The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class XLMWithLMHeadModel(XLMPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["pred_layer.proj.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = XLMModel(config)
        self.pred_layer = XLMPredLayer(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.pred_layer.proj

    def set_output_embeddings(self, new_embeddings):
        self.pred_layer.proj = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        mask_token_id = self.config.mask_token_id
        lang_id = self.config.lang_id

        effective_batch_size = input_ids.shape[0]
        mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
        input_ids = torch.cat([input_ids, mask_token], dim=1)
        if lang_id is not None:
            langs = torch.full_like(input_ids, lang_id)
        else:
            langs = None
        return {"input_ids": input_ids, "langs": langs}

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        output = transformer_outputs[0]
        outputs = self.pred_layer(output, labels)  # (loss, logits) or (logits,) depending on whether labels are provided

        if not return_dict:
            return outputs + transformer_outputs[1:]

        return MaskedLMOutput(
            loss=outputs[0] if labels is not None else None,
            logits=outputs[0] if labels is None else outputs[1],
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    """
)
class XLMForSequenceClassification(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.transformer = XLMModel(config)
        self.sequence_summary = XLMSequenceSummary(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        output = transformer_outputs[0]
        logits = self.sequence_summary(output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """
)
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = XLMModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = transformer_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + transformer_outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class XLMForQuestionAnswering(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = XLMModel(config)
        self.qa_outputs = XLMSQuADHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        is_impossible: Optional[torch.Tensor] = None,
        cls_index: Optional[torch.Tensor] = None,
        p_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, XLMForQuestionAnsweringOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels whether a question has an answer or no answer (SQuAD 2.0)
        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the classification token to use as input for computing plausibility of the
            answer.
        p_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be
            masked. 0.0 mean token is not masked.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, XLMForQuestionAnswering
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
        >>> model = XLMForQuestionAnswering.from_pretrained("FacebookAI/xlm-mlm-en-2048")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
        ...     0
        ... )  # Batch size 1
        >>> start_positions = torch.tensor([1])
        >>> end_positions = torch.tensor([3])

        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        >>> loss = outputs.loss
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        output = transformer_outputs[0]

        outputs = self.qa_outputs(
            output,
            start_positions=start_positions,
            end_positions=end_positions,
            cls_index=cls_index,
            is_impossible=is_impossible,
            p_mask=p_mask,
            return_dict=return_dict,
        )

        if not return_dict:
            return outputs + transformer_outputs[1:]

        return XLMForQuestionAnsweringOutput(
            loss=outputs.loss,
            start_top_log_probs=outputs.start_top_log_probs,
            start_top_index=outputs.start_top_index,
            end_top_log_probs=outputs.end_top_log_probs,
            end_top_index=outputs.end_top_index,
            cls_logits=outputs.cls_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class XLMForTokenClassification(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLMModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class XLMForMultipleChoice(XLMPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.transformer = XLMModel(config)
        self.sequence_summary = XLMSequenceSummary(config)
        self.logits_proj = nn.Linear(config.num_labels, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        langs (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r`   rt   zrThe `lengths` parameter cannot be used with the XLM multiple choice models. Please use the attention mask instead.)r   r   r   r  r   r<   r   r   r  r   r  r   r   r0  )rR   r  ru   r   r9   loggerwarningr   r6  r@  r
   r   r]   r   )rZ   r   r   r   r  r   r<   r   r   r  r.  r   r  r   Znum_choicesr2  r   r1  Zreshaped_logitsrE   r   r&   r&   r'   rf     s^   B


zXLMForMultipleChoice.forwardr3  )rK   rL   rM   rU   r   r   r,   r  r   r  r   r   r   r   rf   rh   r&   r&   r[   r'   r?    sZ    
	

r?  )r?  r<  r7  r5  r=  r   r   r$  rg   )DrN   r   r   dataclassesr   typingr   r   r   r   r   numpyr    r,   r   Ztorch.nnr	   r