
"""PyTorch XLM model."""

import itertools
import math
from dataclasses import dataclass
from typing import Callable, Dict, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import gelu, get_activation
from ...generation import GenerationMixin
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_xlm import XLMConfig


logger = logging.get_logger(__name__)


def create_sinusoidal_embeddings(n_pos, dim, out):
    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
    out.requires_grad = False
    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()


def get_masks(slen, lengths, causal, padding_mask=None):
    """
    Generate hidden states mask, and optionally an attention mask.
    """
    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
    if padding_mask is not None:
        mask = padding_mask
    else:
        assert lengths.max().item() <= slen
        mask = alen < lengths[:, None]

    # attention mask is the same as mask, or triangular inferior attention (causal)
    bs = lengths.size(0)
    if causal:
        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
    else:
        attn_mask = mask

    # sanity check
    assert mask.size() == (bs, slen)
    assert causal is False or attn_mask.size() == (bs, slen, slen)

    return mask, attn_mask


@dataclass
class XLMSquadHeadOutput(ModelOutput):
    r"""
Base class for outputs of question answering models using a [`~modeling_utils.XLMSQuADHead`].

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
        Classification loss as the sum of start token, end token (and is_impossible if provided) classification
        losses.
    start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the top config.start_n_top start token possibilities (beam-search).
    start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Indices for the top config.start_n_top start token possibilities (beam-search).
    end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
        (beam-search).
    end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
    cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
        Log probabilities for the `is_impossible` label of the answers.
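
Example (an illustrative sketch added for clarity, not from the original docstring; `config`, `hidden_states` and
`p_mask` are assumed to already exist):

```python
>>> # `XLMSQuADHead` (defined later in this module) returns this class when called with `return_dict=True`
>>> head = XLMSQuADHead(config)
>>> out = head(hidden_states, p_mask=p_mask, return_dict=True)
>>> out.start_top_index.shape  # (batch_size, config.start_n_top)
>>> out.end_top_index.shape    # (batch_size, config.start_n_top * config.end_n_top)
```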

Nlossstart_top_log_probsstart_top_indexend_top_log_probsend_top_index
cls_logits )__name__
__module____qualname____firstlineno____doc__rK   r   r'   r(   __annotations__rL   rM   
LongTensorrN   rO   rP   __static_attributes__rQ   rG   r2   rI   rI   Q   s    * )-D(5$$
%,7;%"3"34;26OXe../659x 1 12904M8E,,-4.2J**+2rG   rI   c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\R                  4S jjr
S	rU =r$ )XLMPoolerStartLogitsp   z
Compute SQuAD start logits from sequence hidden states.

Args:
    config ([`XLMConfig`]):
        The config used by the model, will be used to grab the `hidden_size` of the model.
configc                 n   > [         TU ]  5         [        R                  " UR                  S5      U l        g Nr   )super__init__r	   Linearhidden_sizedenseselfr]   	__class__s     r2   ra   XLMPoolerStartLogits.__init__y   s&    YYv1115
rG   hidden_statesp_maskreturnc                     U R                  U5      R                  S5      nUb<  UR                  [        R                  :X  a  USU-
  -  SU-  -
  nU$ USU-
  -  SU-  -
  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
        The final hidden states of the model.
    p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
        Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
        should be masked.

Returns:
    `torch.FloatTensor`: The start logits for SQuAD.
r     ꌠ9Y>)F)rd   squeezer6   r'   float16)rf   ri   rj   xs       r2   forwardXLMPoolerStartLogits.forward}   so     JJ}%--b1||u}},V$uv~5  V$tf}4rG   )rd   N)rR   rS   rT   rU   rV   r   ra   r'   r(   r   rs   rY   __classcell__rg   s   @r2   r[   r[   p   sT    6y 6
 W["..8@ARAR8S			 rG   r[   c                      ^  \ rS rSrSrS\4U 4S jjr   SS\R                  S\	\R                     S\	\R                     S\	\R                     S	\R                  4
S
 jjrSrU =r$ )XLMPoolerEndLogits   z
Compute SQuAD end logits from sequence hidden states.

Args:
    config ([`XLMConfig`]):
        The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
        to use.
r]   c                 d  > [         TU ]  5         [        R                  " UR                  S-  UR                  5      U l        [        R                  " 5       U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  S5      U l
        g )Nr!   epsr   )r`   ra   r	   rb   rc   dense_0Tanh
activation	LayerNormlayer_norm_epsdense_1re   s     r2   ra   XLMPoolerEndLogits.__init__   st    yy!3!3a!79K9KL'')f&8&8f>S>STyy!3!3Q7rG   ri   start_statesstart_positionsrj   rk   c                    Uc
  Uc   S5       eUbQ  UR                   SS u  pVUSS2SS4   R                  SSU5      nUR                  SU5      nUR                  SUS5      nU R                  [        R
                  " X/SS95      nU R                  U5      nU R                  U5      nU R                  U5      R                  S5      nUb<  UR                  [        R                  :X  a  USU-
  -  SU-  -
  nU$ USU-
  -  SU-  -
  nU$ )	ac  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
        The final hidden states of the model.
    start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
        The hidden states of the first tokens for the labeled span.
    start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        The position of the first token for the labeled span.
    p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
        Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
        should be masked.

<Tip>

One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
`start_states`.

</Tip>

Returns:
    `torch.FloatTensor`: The end logits for SQuAD.
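
Example (an illustrative sketch added for clarity, not from the original docstring; `config`, `hidden_states`,
`start_positions` and `p_mask` are assumed to already exist):

```python
>>> end_logits_head = XLMPoolerEndLogits(config)
>>> # training-style call: condition the end logits on the gold start positions
>>> end_logits = end_logits_head(hidden_states, start_positions=start_positions, p_mask=p_mask)
>>> end_logits.shape  # (batch_size, seq_len)
```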
        """
        assert start_states is not None or start_positions is not None, (
            "One of start_states, start_positions should be not None"
        )
        if start_positions is not None:
            slen, hsz = hidden_states.shape[-2:]
            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            start_states = hidden_states.gather(-2, start_positions)  # shape (bsz, 1, hsz)
            start_states = start_states.expand(-1, slen, -1)  # shape (bsz, slen, hsz)

        x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
        x = self.activation(x)
        x = self.LayerNorm(x)
        x = self.dense_1(x).squeeze(-1)

        if p_mask is not None:
            if p_mask.dtype == torch.float16:
                x = x * (1 - p_mask) - 65500 * p_mask
            else:
                x = x * (1 - p_mask) - 1e30 * p_mask

        return x


class XLMPoolerAnswerClass(nn.Module):
    """
    Compute SQuAD 2.0 answer class from classification and start tokens hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.activation = nn.Tanh()
        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        start_states: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        cls_index: Optional[torch.LongTensor] = None,
    ) -> torch.FloatTensor:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The SQuAD 2.0 answer class.
        """
        hsz = hidden_states.shape[-1]
        assert start_states is not None or start_positions is not None, (
            "One of start_states, start_positions should be not None"
        )
        if start_positions is not None:
            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            start_states = hidden_states.gather(-2, start_positions).squeeze(-2)  # shape (bsz, hsz)

        if cls_index is not None:
            cls_index = cls_index[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
            cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, hsz)
        else:
            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)

        x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
        x = self.activation(x)
        x = self.dense_1(x).squeeze(-1)

        return x

class XLMSQuADHead(nn.Module):
    """
    A SQuAD head inspired by XLNet.

    Args:
        config ([`XLMConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()
        self.start_n_top = config.start_n_top
        self.end_n_top = config.end_n_top

        self.start_logits = XLMPoolerStartLogits(config)
        self.end_logits = XLMPoolerEndLogits(config)
        self.answer_class = XLMPoolerAnswerClass(config)

    @auto_docstring
    def forward(
        self,
        hidden_states: torch.FloatTensor,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        cls_index: Optional[torch.LongTensor] = None,
        is_impossible: Optional[torch.LongTensor] = None,
        p_mask: Optional[torch.FloatTensor] = None,
        return_dict: bool = False,
    ) -> Union[XLMSquadHeadOutput, Tuple[torch.FloatTensor]]:
        r"""
        hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
            Final hidden states of the model on the sequence tokens.
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Positions of the first token for the labeled span.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Positions of the last token for the labeled span.
        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Whether the question has a possible answer in the paragraph or not.
        p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
            Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.
        """
        start_logits = self.start_logits(hidden_states, p_mask=p_mask)

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, let's remove the dimension added by batch splitting
            for x in (start_positions, end_positions, cls_index, is_impossible):
                if x is not None and x.dim() > 1:
                    x.squeeze_(-1)

            # during training, compute the end logits based on the ground truth of the start position
            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)

            loss_fct = CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

            if cls_index is not None and is_impossible is not None:
                # Predict answerability from the representation of CLS and START
                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
                loss_fct_cls = nn.BCEWithLogitsLoss()
                cls_loss = loss_fct_cls(cls_logits, is_impossible)

                # multiply the classification loss by 0.5 so that its scale is comparable to start_loss and end_loss
                total_loss += cls_loss * 0.5

            return XLMSquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)

        else:
            # during inference, compute the end logits based on beam search
            bsz, slen, hsz = hidden_states.size()
            start_log_probs = nn.functional.softmax(start_logits, dim=-1)  # shape (bsz, slen)

            start_top_log_probs, start_top_index = torch.topk(
                start_log_probs, self.start_n_top, dim=-1
            )  # shape (bsz, start_n_top)
            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)

            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
                start_states
            )  # shape (bsz, slen, start_n_top, hsz)
            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
            end_log_probs = nn.functional.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)

            end_top_log_probs, end_top_index = torch.topk(
                end_log_probs, self.end_n_top, dim=1
            )  # shape (bsz, end_n_top, start_n_top)
            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)

            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)

            if not return_dict:
                return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
            else:
                return XLMSquadHeadOutput(
                    start_top_log_probs=start_top_log_probs,
                    start_top_index=start_top_index,
                    end_top_log_probs=end_top_log_probs,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

class XLMSequenceSummary(nn.Module):
    """
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`XLMConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and activation.
    """

    def __init__(self, config: XLMConfig):
        super().__init__()

        self.summary_type = getattr(config, "summary_type", "last")
        if self.summary_type == "attn":
            # a standard multi-head attention module with absolute positional embedding would be needed for this
            raise NotImplementedError

        self.summary = nn.Identity()
        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = nn.Linear(config.hidden_size, num_classes)

        activation_string = getattr(config, "summary_activation", None)
        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()

        self.first_dropout = nn.Identity()
        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)

        self.last_dropout = nn.Identity()
        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(
        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
    ) -> torch.FloatTensor:
        """
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        """
        if self.summary_type == "last":
            output = hidden_states[:, -1]
        elif self.summary_type == "first":
            output = hidden_states[:, 0]
        elif self.summary_type == "mean":
            output = hidden_states.mean(dim=1)
        elif self.summary_type == "cls_index":
            if cls_index is None:
                cls_index = torch.full_like(
                    hidden_states[..., :1, :],
                    hidden_states.shape[-2] - 1,
                    dtype=torch.long,
                )
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dims of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
        elif self.summary_type == "attn":
            raise NotImplementedError

        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        output = self.last_dropout(output)

        return output


class MultiHeadAttention(nn.Module):
    NEW_ID = itertools.count()

    def __init__(self, n_heads, dim, config):
        super().__init__()
        self.layer_id = next(MultiHeadAttention.NEW_ID)
        self.dim = dim
        self.n_heads = n_heads
        self.dropout = config.attention_dropout
        assert self.dim % self.n_heads == 0

        self.q_lin = nn.Linear(dim, dim)
        self.k_lin = nn.Linear(dim, dim)
        self.v_lin = nn.Linear(dim, dim)
        self.out_lin = nn.Linear(dim, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        attention_head_size = self.dim // self.n_heads
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads)
        # Prune linear layers
        self.q_lin = prune_linear_layer(self.q_lin, index)
        self.k_lin = prune_linear_layer(self.k_lin, index)
        self.v_lin = prune_linear_layer(self.v_lin, index)
        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.dim = attention_head_size * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, input, mask, kv=None, cache=None, head_mask=None, output_attentions=False):
        """
        Self-attention (if kv is None) or attention over source sentence (provided by kv).
        """
        # Input is (bs, qlen, dim); mask is (bs, klen) (non-causal) or (bs, klen, klen)
        bs, qlen, dim = input.size()
        if kv is None:
            klen = qlen if cache is None else cache["slen"] + qlen
        else:
            klen = kv.size(1)
        n_heads = self.n_heads
        dim_per_head = self.dim // n_heads
        mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)

        def shape(x):
            """projection"""
            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)

        def unshape(x):
            """compute context"""
            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)

        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
        if kv is None:
            k = shape(self.k_lin(input))  # (bs, n_heads, qlen, dim_per_head)
            v = shape(self.v_lin(input))  # (bs, n_heads, qlen, dim_per_head)
        elif cache is None or self.layer_id not in cache:
            k = v = kv
            k = shape(self.k_lin(k))  # (bs, n_heads, klen, dim_per_head)
            v = shape(self.v_lin(v))  # (bs, n_heads, klen, dim_per_head)

        if cache is not None:
            if self.layer_id in cache:
                if kv is None:
                    k_, v_ = cache[self.layer_id]
                    k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
                    v = torch.cat([v_, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
                else:
                    k, v = cache[self.layer_id]
            cache[self.layer_id] = (k, v)

        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, qlen, dim_per_head)
        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, qlen, klen)
        mask = (mask == 0).view(mask_reshape).expand_as(scores)  # (bs, n_heads, qlen, klen)
        scores.masked_fill_(mask, torch.finfo(scores.dtype).min)  # (bs, n_heads, qlen, klen)

        weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
        weights = nn.functional.dropout(weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
        context = unshape(context)  # (bs, qlen, dim)

        outputs = (self.out_lin(context),)
        if output_attentions:
            outputs = outputs + (weights,)
        return outputs


class TransformerFFN(nn.Module):
    def __init__(self, in_dim, dim_hidden, out_dim, config):
        super().__init__()
        self.dropout = config.dropout
        self.lin1 = nn.Linear(in_dim, dim_hidden)
        self.lin2 = nn.Linear(dim_hidden, out_dim)
        self.act = gelu if config.gelu_activation else nn.functional.relu
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1

    def forward(self, input):
        return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)

    def ff_chunk(self, input):
        x = self.lin1(input)
        x = self.act(x)
        x = self.lin2(x)
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        return x


@auto_docstring
class XLMPreTrainedModel(PreTrainedModel):
    config_class = XLMConfig
    load_tf_weights = None
    base_model_prefix = "transformer"

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    @property
    def dummy_inputs(self):
        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
        if self.config.use_lang_emb and self.config.n_langs > 1:
            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
        else:
            langs_list = None
        return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Embedding):
            if self.config is not None and self.config.embed_init_std is not None:
                nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if isinstance(module, nn.Linear):
            if self.config is not None and self.config.init_std is not None:
                nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0.0)
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, XLMModel) and self.config.sinusoidal_embeddings:
            create_sinusoidal_embeddings(
                self.config.max_position_embeddings, self.config.emb_dim, out=module.position_embeddings.weight
            )

@dataclass
class XLMForQuestionAnsweringOutput(ModelOutput):
    r"""
    Base class for outputs of question answering models using a `XLMSQuADHead`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
            (beam-search).
        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the `is_impossible` label of the answers.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    start_top_log_probs: Optional[torch.FloatTensor] = None
    start_top_index: Optional[torch.LongTensor] = None
    end_top_log_probs: Optional[torch.FloatTensor] = None
    end_top_index: Optional[torch.LongTensor] = None
    cls_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@auto_docstring
class XLMModel(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # encoder / decoder, output layer
        self.is_encoder = config.is_encoder
        self.is_decoder = not config.is_encoder
        if self.is_decoder:
            raise NotImplementedError("Currently XLM can only be used as an encoder")
        self.causal = config.causal

        # dictionary / languages
        self.n_langs = config.n_langs
        self.use_lang_emb = config.use_lang_emb
        self.n_words = config.n_words
        self.eos_index = config.eos_index
        self.pad_index = config.pad_index

        # model parameters
        self.dim = config.emb_dim  # 512 by default
        self.hidden_dim = self.dim * 4  # 2048 by default
        self.n_heads = config.n_heads  # 8 by default
        self.n_layers = config.n_layers
        self.dropout = config.dropout
        self.attention_dropout = config.attention_dropout
        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"

        # embeddings
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
        if config.n_langs > 1 and config.use_lang_emb:
            self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
        self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)

        # transformer layers
        self.attentions = nn.ModuleList()
        self.layer_norm1 = nn.ModuleList()
        self.ffns = nn.ModuleList()
        self.layer_norm2 = nn.ModuleList()

        for _ in range(self.n_layers):
            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config))
            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))

        if hasattr(config, "pruned_heads"):
            pruned_heads = config.pruned_heads.copy().items()
            config.pruned_heads = {}
            for layer, heads in pruned_heads:
                if self.attentions[int(layer)].n_heads == config.n_heads:
                    self.prune_heads({int(layer): list(map(int, heads))})

        # Initialize weights and apply final processing
        self.post_init()
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def get_input_embeddings(self):
        return self.embeddings

    def set_input_embeddings(self, new_embeddings):
        self.embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.attentions[layer].prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None:
            bs, slen = input_ids.size()
        else:
            bs, slen = inputs_embeds.size()[:-1]

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if lengths is None:
            if input_ids is not None:
                lengths = (input_ids != self.pad_index).sum(dim=1).long()
            else:
                lengths = torch.tensor([slen] * bs, device=device)

        # check inputs
        assert lengths.size(0) == bs
        assert lengths.max().item() <= slen

        # generate masks
        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)

        # position_ids
        if position_ids is None:
            position_ids = self.position_ids[:, :slen]
        else:
            assert position_ids.size() == (bs, slen)

        # langs
        if langs is not None:
            assert langs.size() == (bs, slen)

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.n_layers)

        # do not recompute cached elements
        if cache is not None and input_ids is not None:
            _slen = slen - cache["slen"]
            input_ids = input_ids[:, -_slen:]
            position_ids = position_ids[:, -_slen:]
            if langs is not None:
                langs = langs[:, -_slen:]
            mask = mask[:, -_slen:]
            attn_mask = attn_mask[:, -_slen:]

        # embeddings
        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
        if langs is not None and self.use_lang_emb and self.n_langs > 1:
            tensor = tensor + self.lang_embeddings(langs)
        if token_type_ids is not None:
            tensor = tensor + self.embeddings(token_type_ids)
        tensor = self.layer_norm_emb(tensor)
        tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training)
        tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # transformer layers
        hidden_states = () if output_hidden_states else None
        attentions = () if output_attentions else None
        for i in range(self.n_layers):
            if output_hidden_states:
                hidden_states = hidden_states + (tensor,)

            # self attention
            attn_outputs = self.attentions[i](
                tensor,
                attn_mask,
                cache=cache,
                head_mask=head_mask[i],
                output_attentions=output_attentions,
            )
            attn = attn_outputs[0]
            if output_attentions:
                attentions = attentions + (attn_outputs[1],)
            attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
            tensor = tensor + attn
            tensor = self.layer_norm1[i](tensor)

            # FFN
            tensor = tensor + self.ffns[i](tensor)
            tensor = self.layer_norm2[i](tensor)
            tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # Add last hidden state
        if output_hidden_states:
            hidden_states = hidden_states + (tensor,)

        # update cache length
        if cache is not None:
            cache["slen"] += tensor.size(1)

        if not return_dict:
            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)

        return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)

class XLMPredLayer(nn.Module):
    """
    Prediction layer (cross_entropy or adaptive_softmax).
    """

    def __init__(self, config):
        super().__init__()
        self.asm = config.asm
        self.n_words = config.n_words
        self.pad_index = config.pad_index
        dim = config.emb_dim

        if config.asm is False:
            self.proj = nn.Linear(dim, config.n_words, bias=True)
        else:
            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
                in_features=dim,
                n_classes=config.n_words,
                cutoffs=config.asm_cutoffs,
                div_value=config.asm_div_value,
                head_bias=True,
            )

    def forward(self, x, y=None):
        """Compute the loss, and optionally the scores."""
        outputs = ()
        if self.asm is False:
            scores = self.proj(x)
            outputs = (scores,) + outputs
            if y is not None:
                loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean")
                outputs = (loss,) + outputs
        else:
            scores = self.proj.log_prob(x)
            outputs = (scores,) + outputs
            if y is not None:
                _, loss = self.proj(x, y)
                outputs = (loss,) + outputs

        return outputs


@auto_docstring(
    custom_intro="""
    The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class XLMWithLMHeadModel(XLMPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["pred_layer.proj.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = XLMModel(config)
        self.pred_layer = XLMPredLayer(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.pred_layer.proj

    def set_output_embeddings(self, new_embeddings):
        self.pred_layer.proj = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        mask_token_id = self.config.mask_token_id
        lang_id = self.config.lang_id

        effective_batch_size = input_ids.shape[0]
        mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
        input_ids = torch.cat([input_ids, mask_token], dim=1)
        if lang_id is not None:
            langs = torch.full_like(input_ids, lang_id)
        else:
            langs = None
        return {"input_ids": input_ids, "langs": langs}

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
            languages ids which can be obtained from the language names by using two conversion mappings provided in
            the configuration of the model (only provided for multilingual models). More precisely, the *language name
            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

            See usage examples detailed in the [multilingual documentation](../multilingual).
        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
            `[0, ..., input_ids.size(-1)]`.
        cache (`Dict[str, torch.FloatTensor]`, *optional*):
            Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
            decoding.

            The dictionary object will be modified in-place during the forward pass to add newly computed
            hidden-states.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )

        output = transformer_outputs[0]
        outputs = self.pred_layer(output, labels)  # (loss, logits) or (logits,) depending on whether labels are given

        if not return_dict:
            return outputs + transformer_outputs[1:]

        return MaskedLMOutput(
            loss=outputs[0] if labels is not None else None,
            logits=outputs[0] if labels is None else outputs[1],
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    c                      ^  \ rS rSrU 4S jr\             SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\	\
\R                  4      S
\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )XLMForSequenceClassificationiH  c                    > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        U R                  5         g ru   )	r`   ra   r   r]   rW  r4  r   sequence_summaryr~  re   s     r2   ra   %XLMForSequenceClassification.__init__O  sH      ++#F+ 26 : 	rG   r>  r?  r@  r  rh  r?   r  r  r  r  r  r  r   rk   c                 :   Ub  UOU R                   R                  nU R                  UUUUUUUUU	UUUS9nUS   nU R                  U5      nSnU
Gb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  U
R                  [        R                  :X  d  U
R                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aJ  [        5       nU R
                  S:X  a&  U" UR                  5       U
R                  5       5      nOU" UU
5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      U
R                  S5      5      nO-U R                   R                  S:X  a  [        5       nU" UU
5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S	9$ )
a  
langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
    languages ids which can be obtained from the language names by using two conversion mappings provided in
    the configuration of the model (only provided for multilingual models). More precisely, the *language name
    to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
    *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

    See usage examples detailed in the [multilingual documentation](../multilingual).
lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Length of each sentence that can be used to avoid performing attention on padding token indices. You can
    also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
    `[0, ..., input_ids.size(-1)]`.
cache (`Dict[str, torch.FloatTensor]`, *optional*):
    Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
    attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
    decoding.

    The dictionary object will be modified in-place during the forward pass to add newly computed
    hidden-states.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
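
Example (illustrative sketch; the classification head on top of the base `FacebookAI/xlm-mlm-en-2048` checkpoint
is randomly initialized, so the loss and logits below are only meaningful after fine-tuning, and `num_labels=2` is
an assumed setting for a binary task):

```python
>>> import torch
>>> from transformers import AutoTokenizer, XLMForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
>>> model = XLMForSequenceClassification.from_pretrained("FacebookAI/xlm-mlm-en-2048", num_labels=2)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=torch.tensor([1]))
>>> loss, logits = outputs.loss, outputs.logits
```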
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        output = transformer_outputs[0]
        logits = self.sequence_summary(output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
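
    Example (a usage sketch for illustration; it assumes the base `FacebookAI/xlm-mlm-en-2048` checkpoint, whose
    span-classification head is randomly initialized until the model is fine-tuned on a QA dataset):

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, XLMForQuestionAnsweringSimple

    >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
    >>> model = XLMForQuestionAnsweringSimple.from_pretrained("FacebookAI/xlm-mlm-en-2048")

    >>> question, context = "Who owns the dog?", "My sister owns a small dog."
    >>> inputs = tokenizer(question, context, return_tensors="pt")
    >>> outputs = model(**inputs, start_positions=torch.tensor([1]), end_positions=torch.tensor([3]))
    >>> loss, start_logits, end_logits = outputs.loss, outputs.start_logits, outputs.end_logits
    ```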
    """
)
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = XLMModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.FloatTensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = transformer_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, squeeze the extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + transformer_outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """
)
class XLMForQuestionAnswering(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = XLMModel(config)
        self.qa_outputs = XLMSQuADHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.FloatTensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        is_impossible: Optional[torch.Tensor] = None,
        cls_index: Optional[torch.Tensor] = None,
        p_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, XLMForQuestionAnsweringOutput]:
        r"""
langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
    languages ids which can be obtained from the language names by using two conversion mappings provided in
    the configuration of the model (only provided for multilingual models). More precisely, the *language name
    to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
    *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

    See usage examples detailed in the [multilingual documentation](../multilingual).
lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Length of each sentence that can be used to avoid performing attention on padding token indices. You can
    also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
    `[0, ..., input_ids.size(-1)]`.
cache (`Dict[str, torch.FloatTensor]`, *optional*):
    Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
    attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
    decoding.

    The dictionary object will be modified in-place during the forward pass to add newly computed
    hidden-states.
is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for position (index) of the classification token to use as input for computing plausibility of the
    answer.
p_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means the token should be
    masked, 0.0 means the token is not masked.

Example:

```python
>>> from transformers import AutoTokenizer, XLMForQuestionAnswering
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
>>> model = XLMForQuestionAnswering.from_pretrained("FacebookAI/xlm-mlm-en-2048")

>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
...     0
... )  # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])

>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
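
>>> # An assumed extra step (illustrative sketch): the beam-search head also accepts SQuAD 2.0-style
>>> # supervision, where `is_impossible` marks unanswerable questions and `cls_index` points to the
>>> # token used to score answerability.
>>> outputs = model(
...     input_ids,
...     start_positions=start_positions,
...     end_positions=end_positions,
...     cls_index=torch.tensor([0]),
...     is_impossible=torch.tensor([0.0]),
... )
>>> loss = outputs.loss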
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        output = transformer_outputs[0]

        outputs = self.qa_outputs(
            output,
            start_positions=start_positions,
            end_positions=end_positions,
            cls_index=cls_index,
            is_impossible=is_impossible,
            p_mask=p_mask,
            return_dict=return_dict,
        )

        if not return_dict:
            return outputs + transformer_outputs[1:]

        return XLMForQuestionAnsweringOutput(
            loss=outputs.loss,
            start_top_log_probs=outputs.start_top_log_probs,
            start_top_index=outputs.start_top_index,
            end_top_log_probs=outputs.end_top_log_probs,
            end_top_index=outputs.end_top_index,
            cls_logits=outputs.cls_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """
)
class XLMForTokenClassification(XLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLMModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.FloatTensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
    languages ids which can be obtained from the language names by using two conversion mappings provided in
    the configuration of the model (only provided for multilingual models). More precisely, the *language name
    to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
    *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

    See usage examples detailed in the [multilingual documentation](../multilingual).
lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Length of each sentence that can be used to avoid performing attention on padding token indices. You can
    also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
    `[0, ..., input_ids.size(-1)]`.
cache (`Dict[str, torch.FloatTensor]`, *optional*):
    Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
    attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
    decoding.

    The dictionary object will be modified in-place during the forward pass to add newly computed
    hidden-states.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
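
Example (illustrative sketch; the token-classification head on top of the base `FacebookAI/xlm-mlm-en-2048`
checkpoint is randomly initialized, so predictions are only meaningful after fine-tuning, and `num_labels=5` is an
assumed tag-set size):

```python
>>> import torch
>>> from transformers import AutoTokenizer, XLMForTokenClassification

>>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
>>> model = XLMForTokenClassification.from_pretrained("FacebookAI/xlm-mlm-en-2048", num_labels=5)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.zeros_like(inputs["input_ids"])  # one label id per token, all 0 here for illustration
>>> outputs = model(**inputs, labels=labels)
>>> loss, logits = outputs.loss, outputs.logits  # logits: (batch_size, sequence_length, num_labels)
```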
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    XLM Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """
)
class XLMForMultipleChoice(XLMPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.transformer = XLMModel(config)
        self.sequence_summary = XLMSequenceSummary(config)
        self.logits_proj = nn.Linear(config.num_labels, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        langs: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        lengths: Optional[torch.Tensor] = None,
        cache: Optional[Dict[str, torch.FloatTensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
langs (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
    languages ids which can be obtained from the language names by using two conversion mappings provided in
    the configuration of the model (only provided for multilingual models). More precisely, the *language name
    to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
    *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).

    See usage examples detailed in the [multilingual documentation](../multilingual).
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Length of each sentence that can be used to avoid performing attention on padding token indices. You can
    also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
    `[0, ..., input_ids.size(-1)]`.
cache (`Dict[str, torch.FloatTensor]`, *optional*):
    Dictionary string to `torch.FloatTensor` that contains precomputed hidden states (key and values in the
    attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
    decoding.

    The dictionary object will be modified in-place during the forward pass to add newly computed
    hidden-states.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
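
Example (illustrative sketch; the multiple-choice head on top of the base `FacebookAI/xlm-mlm-en-2048` checkpoint
is randomly initialized, so the scores below are only meaningful after fine-tuning; the prompt/choices strings are
made-up data for illustration):

```python
>>> import torch
>>> from transformers import AutoTokenizer, XLMForMultipleChoice

>>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
>>> model = XLMForMultipleChoice.from_pretrained("FacebookAI/xlm-mlm-en-2048")

>>> prompt = "The dog is very"
>>> choices = ["cute.", "a spaceship."]
>>> inputs = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
>>> # reshape the flat (num_choices, sequence_length) encoding to (batch_size=1, num_choices, sequence_length)
>>> inputs = {k: v.unsqueeze(0) for k, v in inputs.items()}
>>> outputs = model(**inputs, labels=torch.tensor([0]))
>>> loss, logits = outputs.loss, outputs.logits  # logits: (batch_size, num_choices)
```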
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        langs = langs.view(-1, langs.size(-1)) if langs is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        if lengths is not None:
            logger.warning(
                "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
                "attention mask instead."
            )
            lengths = None

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        output = transformer_outputs[0]
        logits = self.sequence_summary(output)
        logits = self.logits_proj(logits)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = [
    "XLMForMultipleChoice",
    "XLMForQuestionAnswering",
    "XLMForQuestionAnsweringSimple",
    "XLMForSequenceClassification",
    "XLMForTokenClassification",
    "XLMModel",
    "XLMPreTrainedModel",
    "XLMWithLMHeadModel",
]