
    fTh                        S r SSKrSSKJrJrJrJr  SSKrSSKrSSKJ	r	  SSK
JrJrJr  SSKJrJr  SSKJr  SS	KJrJrJrJrJrJrJrJr  SS
KJr  SSKJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  \$RN                  " \(5      r) " S S\	RT                  5      r+ " S S\	RT                  5      r, " S S\	RT                  5      r- " S S\	RT                  5      r. " S S\	RT                  5      r/ " S S\	RT                  5      r0 " S S\	RT                  5      r1 " S S\	RT                  5      r2 " S S \	RT                  5      r3 " S! S"\	RT                  5      r4\# " S# S$\5      5       r5\#" S%S&9 " S' S(\55      5       r6\#" S)S&9 " S* S+\5\5      5       r7\# " S, S-\55      5       r8 " S. S/\	RT                  5      r9\#" S0S&9 " S1 S2\55      5       r:\# " S3 S4\55      5       r;\# " S5 S6\55      5       r< " S7 S8\	RT                  5      r=\# " S9 S:\55      5       r>S=S; jr?/ S<Qr@g)>zPyTorch X-MOD model.    N)ListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)GenerationMixin))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
XmodConfigc                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )XmodEmbeddings/   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  UR                  U l        [        R                  " UR                  UR
                  U R6                  S9U l	        g )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr&   register_buffertorcharangeexpandzerosr(   sizelongr#   selfconfig	__class__s     ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/xmod/modeling_xmod.pyr/   XmodEmbeddings.__init__5   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
     c                    Uc+  Ub  [        XR                  U5      nOU R                  U5      nUb  UR                  5       nOUR                  5       S S nUS   nUcv  [	        U S5      (       a-  U R
                  S S 2S U24   nUR                  US   U5      n	U	nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R!                  U5      nU R#                  U5      nU$ )Nr)   r   r+   r   r-   devicer'   )"create_position_ids_from_input_idsr#   &create_position_ids_from_inputs_embedsrD   hasattrr+   rB   r@   rC   rE   r(   rO   r4   r8   r&   r6   r9   r=   )rG   	input_idsr+   r(   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr8   
embeddingsr6   s                rJ   forwardXmodEmbeddings.forwardN   sM    $A)M]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
rL   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr)   r   rN   r   )rD   r@   rA   r#   rE   rO   	unsqueezerB   )rG   rT   rV   sequence_lengthr(   s        rJ   rQ   5XmodEmbeddings.create_position_ids_from_inputs_embedsv   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rL   )r9   r=   r#   r&   r6   r8   r4   )NNNNr   )
__name__
__module____qualname____firstlineno____doc__r/   r[   rQ   __static_attributes____classcell__rI   s   @rJ   r    r    /   s$    

4 rs&P= =rL   r    c                   b  ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr      SS\R                  S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
\
\R                           S\\   S\
\R                     4S jjrSrU =r$ )XmodSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r&   r'   relative_keyrelative_key_query   r   )r.   r/   r2   num_attention_headsrR   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer;   attention_probs_dropout_probr=   r>   r&   r5   r0   distance_embedding
is_decoderrG   rH   r&   rI   s      rJ   r/   XmodSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++rL   xreturnc                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr)   r   rq   r   r   )rD   rr   ru   viewpermute)rG   r   new_x_shapes      rJ   transpose_for_scores&XmodSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$rL   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 V   U R                  U5      nUS Ln	U	(       a  Ub  US   n
US   nUnGOU	(       aC  U R                  U R                  U5      5      n
U R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      n
U R                  U R                  U5      5      n[        R
                  " US   U
/SS9n
[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U5      nUS LnU R                  (       a  X4n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  UR                  S   U
R                  S   nnU(       aB  [        R                  " US-
  [        R                  UR                  S	9R                  SS5      nO>[        R                  " U[        R                  UR                  S	9R                  SS5      n[        R                  " U[        R                  UR                  S	9R                  SS5      nUU-
  nU R!                  UU R"                  -   S-
  5      nUR%                  UR&                  S
9nU R                  S:X  a  [        R(                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R(                  " SUU5      n[        R(                  " SU
U5      nUU-   U-   nU[*        R,                  " U R.                  5      -  nUb  X-   n[0        R2                  R5                  USS9nU R7                  U5      nUb  UU-  n[        R                  " UU5      nUR9                  SSSS5      R;                  5       nUR=                  5       S S U R>                  4-   nUR                  U5      nU(       a  UU4OU4nU R                  (       a  UU4-   nU$ )Nr   r   rq   dimr)   ro   rp   rN   r,   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) rx   r   ry   rz   r@   catr}   matmul	transposer&   shapetensorrE   rO   r   rA   r|   r5   tor-   einsummathsqrtru   r   
functionalsoftmaxr=   r   
contiguousrD   rv   )rG   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               rJ   r[   XmodSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11GrL   )rv   ru   r|   r=   r}   ry   r5   rr   r&   rx   rz   NNNNNNF)ra   rb   rc   rd   r/   r@   Tensorr   r   FloatTensorr   boolr[   rf   rg   rh   s   @rJ   rj   rj      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	c crL   rj   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )XmodSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr$   )r.   r/   r   rw   r2   denser9   r:   r;   r<   r=   rF   s     rJ   r/   XmodSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rL   r   input_tensorr   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   )r   r=   )rG   r   r   s      rJ   r[   XmodSelfOutput.forward  s,    

=1]3%4rL   )r9   r   r=   
ra   rb   rc   rd   r/   r@   r   r[   rf   rg   rh   s   @rJ   r   r     s6    >U\\  RWR^R^  rL   r   c                   .  ^  \ rS rSrSU 4S jjrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\\   S\
\R                     4S jjrSrU =r$ )XmodAttentioni  c                    > [         TU ]  5         [        XS9U l        [	        U5      U l        [        5       U l        UR                  U l        g )Nr&   )	r.   r/   rj   rG   r   outputsetpruned_headspre_normr~   s      rJ   r/   XmodAttention.__init__  s;    %f^	$V,ErL   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   rG   rr   ru   r   r   rx   ry   rz   r   r   rv   union)rG   headsindexs      rJ   prune_headsXmodAttention.prune_heads'  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rL   r   r   r   r   r   r   r   r   c           	      $   UnU R                   (       a  U R                  R                  U5      nU R                  UUUUUUU5      n	U R                  U	S   U5      n
U R                   (       d  U R                  R                  U
5      n
U
4U	SS  -   nU$ )Nr   r   )r   r   r9   rG   )rG   r   r   r   r   r   r   r   residualself_outputsattention_outputr   s               rJ   r[   XmodAttention.forward9  s     !== KK11-@Myy!"
  ;;|AA}}#{{445EF#%QR(88rL   )r   r   r   rG   r   r   )ra   rb   rc   rd   r/   r   r@   r   r   r   r   r   r[   rf   rg   rh   s   @rJ   r   r     s    (;* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	 rL   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )XmodIntermediateiW  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r.   r/   r   rw   r2   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrF   s     rJ   r/   XmodIntermediate.__init__X  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rL   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rG   r   s     rJ   r[   XmodIntermediate.forward`  s&    

=100?rL   r   r   rh   s   @rJ   r   r   W  s(    9U\\ ell  rL   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )XmodAdapterif  c                   > [         TU ]  5         UR                  UR                  -  U l        [
        R                  " UR                  U R                  5      U l        [
        R                  " U R                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r.   r/   r2   adapter_reduction_factorbottleneck_sizer   rw   dense1dense2r   r   r   r   adapter_act_fnrF   s     rJ   r/   XmodAdapter.__init__g  s    %11V5T5TTii 2 2D4H4HIii 4 4f6H6HIf''--"():):";D"("3"3DrL   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r   s     rJ   r[   XmodAdapter.forwardq  s4    M2++M:M2rL   )r   r   r   r   r   rh   s   @rJ   r   r   f  s(    4U\\ ell  rL   r   c                      ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  S\R                  4S jrS\R                  S\R                  4S jrS	r	U =r
$ )

XmodOutputix  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        UR                  U l	        [        R                  " UR                  5      U l        UR                  (       a/  [        R                  " UR
                  UR                  S9U l        OS U l        UR                  U l        [        R                  " 0 5      U l        UR"                   H$  n[%        U5      U R                   ['        U5      '   M&     g r   )r.   r/   r   rw   r   r2   r   r9   r:   ln_before_adapterr;   r<   r=   adapter_layer_normadapter_reuse_layer_norm
ModuleDictadapter_modules	languagesr   r   )rG   rH   languagerI   s      rJ   r/   XmodOutput.__init__y  s    YYv779K9KL
f&8&8f>S>ST!'!9!9zz&"<"<=$$&(ll63E3E6K`K`&aD#&*D#(.(G(G%!}}R0((H2=f2ED  X/ )rL   r   r   lang_idsr   c                 t    U R                  U5      nU R                  U5      nX-   nU R                  X15      nU$ r   )r   r=   lang_adapter)rG   r   r   r   s       rJ   r[   XmodOutput.forward  s<    

=1]3%4))(BrL   c                    [         R                  " USS9u  pU R                  (       d  UnU R                  b  U R                  U5      nO"U R                  (       a  U R                  U5      nU R                  (       a  Un[         R                  " X#R                  5       S5      n/ n[        [        X5      5       Hi  u  nu  p[        U R                  R                  5       5      [        UR                  5       5         n
UR                  U R                  U
   " U	5      5        Mk     [         R                   " US5      nU R#                  U5      nUW-  nU$ )NT)return_countsr   )r@   unique_consecutiver   r   r   r9   splittolist	enumerateziplistr   keysrt   itemappendr   r=   )rG   r   r   lang_lengthsr   split_hidden_stateslang_wise_outputsilang_idsplit_hidden_statelangs              rJ   r   XmodOutput.lang_adapter  s   !&!9!9(RV!W%%$H"". 33MBM** NN=9M!!$H#kk-9L9L9NPQR09#h:\0],A,,,1134S5HID$$T%9%9$%?@R%ST 1^ 		"3Q7]3!rL   )r9   r   r   r   r   r=   r   )ra   rb   rc   rd   r/   r@   r   r[   r   rf   rg   rh   s   @rJ   r   r   x  s`    FU\\  Y^YeYe jojvjv U\\ %,,  rL   r   c                   B  ^  \ rS rSrU 4S jr      SS\R                  S\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\	\	\R                           S
\\
   S\	\R                     4S jjrS rSrU =r$ )	XmodLayeri  c                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        UR                  U l        g )Nr   z> should be used as a decoder model if cross attention is addedr'   r   )r.   r/   chunk_size_feed_forwardseq_len_dimr   	attentionr}   add_cross_attentionrs   crossattentionr   intermediater   r   r   rF   s     rJ   r/   XmodLayer.__init__  s    '-'E'E$&v. ++#)#=#= ##?? D6)g!hii"/PZ"[D,V4 (rL   r   r   r   r   r   r   r   r   r   c	           	         Ub  US S OS n	U R                  UUUUU	S9n
U
S   nU R                  (       a  U
SS nU
S   nOU
SS  nS nU R                  (       a[  UbX  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  UUUUUUU5      nUS   nUUSS -   nUS   nWU-   nUnU R
                  (       a  U R                  R                  U5      n[        U R                  U R                  U R                  U5      nU R                  UUU5      nU R
                  (       d  U R                  R                  U5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nrq   )r   r   r   r   r)   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )r  r}   rR   rs   r  r   r   r9   r   feed_forward_chunkr  r  )rG   r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputsr   intermediate_outputlayer_outputs                       rJ   r[   XmodLayer.forward  s    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P#==#{{445EF7##((	
 {{#6(K}};;00>L/G+ ??!2 44GrL   c                 $    U R                  U5      $ r   )r  )rG   r   s     rJ   r  XmodLayer.feed_forward_chunk  s      !122rL   )	r  r  r  r  r  r}   r   r   r  r   )ra   rb   rc   rd   r/   r@   r   r   r   r   r   r[   r  rf   rg   rh   s   @rJ   r  r    s    (& 7;15=A>BDH,1I||I ,,I !!2!23	I
 E--.I  ((9(9:I !)):): ;I !uU->->'?!@AI $D>I 
u||	IV3 3rL   r  c                   j  ^  \ rS rSrU 4S jr         SS\R                  S\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\	\	\R                           S
\\
   S\\
   S\\
   S\\
   S\\	\R                     \4   4S jjrSrU =r$ )XmodEncoderi	  c                 v  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        UR                  U l
        U R                  (       a.  [        R                  " UR                  UR                  S9U l        SU l        g s  snf )Nr$   F)r.   r/   rH   r   
ModuleListrangenum_hidden_layersr  layerr   is_pre_normr9   r2   r:   gradient_checkpointing)rG   rH   _rI   s      rJ   r/   XmodEncoder.__init__
  s    ]]uVE]E]?^#_?^!If$5?^#_`
!??\\&*<*<&BWBWXDN&+#	 $`s   B6r   r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr   c                    U R                   (       a/  U R                  (       a  U(       a  [        R                  S5        SnU
(       a  SOS nU	(       a  SOS nU	(       a  U R                  R
                  (       a  SOS nU(       a  SOS n[        U R                  5       H  u  nnU
(       a  X4-   nUb  UU   OS nUb  UU   OS nU R                   (       a5  U R                  (       a$  U R                  UR                  UUUUUUUU	5	      nOU" UUUUUUUU	5      nUS   nU(       a	  UUS   4-  nU	(       d  M  UUS   4-   nU R                  R
                  (       d  M  UUS   4-   nM     U R                  (       a  U R                  U5      nU
(       a  X4-   nU(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F r   r)   r   rq   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r8  ).0vs     rJ   	<genexpr>&XmodEncoder.forward.<locals>.<genexpr>Z  s"      
A  s   	)last_hidden_stater4  r   
attentionscross_attentions)r1  trainingloggerwarning_oncerH   r  r  r/  _gradient_checkpointing_func__call__r0  r9   tupler   )rG   r   r   r   r   r   r   r4  r   r   r5  r6  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacher  layer_modulelayer_head_maskr   layer_outputss                        rJ   r[   XmodEncoder.forward  s    &&4==##p "	"6BD$5b4%64;;;Z;Zr`d#,R$(4OA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%
! !-!"#)*"%	! *!,M"}R'8&::"  &9]1=M<O&O#;;222+?=QRCSBU+U(K  5N  NN=9M 14D D 
 "&%'(
 
 
 9+.+*1
 	
rL   )r9   rH   r1  r0  r/  )	NNNNNNFFT)ra   rb   rc   rd   r/   r@   r   r   r   r   r   r   r   r[   rf   rg   rh   s   @rJ   r*  r*  	  s    , 7;15=A>BEI$(,1/4&*X
||X
 ,,X
 !!2!23	X

 E--.X
  ((9(9:X
 !)):): ;X
 "%e.?.?(@"ABX
 D>X
 $D>X
 'tnX
 d^X
 
uU\\"$MM	NX
 X
rL   r*  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
XmodPoolerio  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r.   r/   r   rw   r2   r   Tanh
activationrF   s     rJ   r/   XmodPooler.__init__p  s9    YYv1163E3EF
'')rL   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r   rS  )rG   r   first_token_tensorpooled_outputs       rJ   r[   XmodPooler.forwardu  s6     +1a40

#566rL   )rS  r   r   rh   s   @rJ   rP  rP  o  s(    $
U\\ ell  rL   rP  c                   :    \ rS rSr\rSrSrS rS\	4S jr
S rSrg	)
XmodPreTrainedModeli~  robertaTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a%  UR                  R                  R                  5         gg)zInitialize the weightsg        )meanstdNg      ?)r   r   rw   weightdatanormal_rH   initializer_rangebiaszero_r0   r#   r9   fill_
XmodLMHead)rG   modules     rJ   _init_weights!XmodPreTrainedModel._init_weights  s2   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S)
++KK""$ ,rL   r   c           	          XR                   R                  ;  a0  [        U  SU S[        U R                   R                  5       35      eXR                   l        g)z
Set the default language code for the model. This is used when the language is not specified in the input.

Args:
    language (`str`): The language code, such as `"en_XX"` or `"de_DE"`.
z does not have an adapter for z. Supported languages: N)rH   r   rs   r  default_language)rG   r   s     rJ   set_default_language(XmodPreTrainedModel.set_default_language  sW     ;;000&6xj@WX\]a]h]h]r]rXsWtu  (0$rL   c                     [         R                  S5        U R                  R                  R	                  5        H
  nSUl        M     [         R                  S5        U R                  R                  R                   H~  nUR                  R                  b2  UR                  R                  R	                  5        H
  nSUl        M     UR                  R                  R	                  5        H
  nSUl        M     M     g)z
Freeze the embeddings and language adapters of the model. Usually, this is applied before the model is
fine-tuned on a downstream task.
zFreezing embeddingsFzFreezing adaptersN)rB  infor\  rZ   
parametersrequires_gradencoderr/  r   r   r   )rG   	parameterr/  s      rJ   'freeze_embeddings_and_language_adapters;XmodPreTrainedModel.freeze_embeddings_and_language_adapters  s    
 	)*00;;=I&+I# >'(\\))//E||..:!&!@!@!K!K!MI.3I+ "N"\\99DDF	*/	' G	 0rL   r8  N)ra   rb   rc   rd   r   config_classbase_model_prefixsupports_gradient_checkpointingri  r   rm  ru  rf   r8  rL   rJ   r[  r[  ~  s)    L!&*#%$0S 00rL   r[  a(  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    )custom_introc            "         ^  \ rS rSrSU 4S jjrS rS rS r\              SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\   S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rSrU =r$ )	XmodModeli  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
r.   r/   rH   r    rZ   r*  rs  rP  pooler	post_init)rG   rH   add_pooling_layerrI   s      rJ   r/   XmodModel.__init__  sK    
 	 (0"6*,=j(4 	rL   c                 .    U R                   R                  $ r   rZ   r4   rG   s    rJ   get_input_embeddingsXmodModel.get_input_embeddings  s    ...rL   c                 $    XR                   l        g r   r  )rG   rz   s     rJ   set_input_embeddingsXmodModel.set_input_embeddings  s    */'rL   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsrs  r/  r  r   )rG   heads_to_pruner/  r   s       rJ   _prune_headsXmodModel._prune_heads  s<    
 +002LELLu%//;;EB 3rL   rS   r   r   r+   r(   r   rT   r   r   r4  r   r   r5  r6  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  Ub  UOU R                   R
                  nOSnUb  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[        S5      eUu  nnUb  UR                  OUR                  nU
b  U
S   S   R                  S   OSnUc  U R                   R                  c  [        S5      e[        U R                  R                  S   R                  R                   R#                  5       5      nUR%                  U R                   R                  5      nU[&        R(                  " UUS	9-  nUc  [&        R(                  " UUU-   4US	9nUcs  [+        U R,                  S
5      (       a4  U R,                  R.                  SS2SU24   nUR1                  UU5      nUnO$[&        R2                  " U[&        R4                  US9nU R7                  X?5      nU R                   R                  (       aE  UbB  UR                  5       u  nnnUU4nU	c  [&        R(                  " UUS	9n	U R9                  U	5      nOSnU R;                  X`R                   R<                  5      nU R-                  UUUUUS9nU R                  UUUUUUU
UUUUS9nUS   n U R>                  b  U R?                  U 5      OSn!U(       d
  U U!4USS -   $ [A        U U!URB                  URD                  URF                  URH                  S9$ )
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
NFzDYou cannot specify both input_ids and inputs_embeds at the same timer)   z5You have to specify either input_ids or inputs_embedsr   rq   zPInput language unknown. Please call `XmodPreTrainedModel.set_default_language()`)rO   r+   rN   )rS   r(   r+   rT   rU   )
r   r   r   r   r   r4  r   r   r5  r6  r   )r>  pooler_outputr4  r   r?  r@  )%rH   r   r5  use_return_dictr}   r   rs   %warn_if_padding_and_no_attention_maskrD   rO   r   rl  r  rs  r/  r   r   r  r   r@   onesrR   rZ   r+   rB   rC   rE   get_extended_attention_maskinvert_attention_maskget_head_maskr.  r~  r   r4  r   r?  r@  )"rG   rS   r   r   r+   r(   r   rT   r   r   r4  r   r   r5  r6  rV   
batch_sizerW   rO   rU   adapter_languagesdefault_lang_idrX   rY   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr2  encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputrX  s"                                     rJ   r[   XmodModel.forward  s   . 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de{{++3 !stt $T\\%7%7%:%A%A%Q%Q%V%V%X Y/55dkk6R6RSO&Jv)NNH!"ZZ*jCY6Y)ZdjkN!t(899*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rL   )rH   rZ   rs  r~  )T)NNNNNNNNNNNNNN)ra   rb   rc   rd   r/   r  r  r  r   r   r@   r   
LongTensorr   r   r   r   r   r   r[   rf   rg   rh   s   @rJ   r|  r|    s    "/0C  -1/31515/3,0048<9==A$(,0/3&*z
ELL)z
 5++,z
 !.	z

 !.z
 u||,z
 ELL)z
  -z
  (5z
 !) 6z
 "$u'8'8"9:z
 D>z
 $D>z
 'tnz
 d^z
  
uU\\"$PP	Q!z
 z
rL   r|  zQ
    X-MOD Model with a `language modeling` head on top for CLM fine-tuning.
    c            $         ^  \ rS rSrSS/rU 4S jrS rS r\               SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\\
R                           S\	\   S\	\   S\	\   S\	\   S\\\
R                      \4   4 S jj5       rS rSrU =r$ )XmodForCausalLMid  lm_head.decoder.weightlm_head.decoder.biasc                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzLIf you want to use `XmodLMHeadModel` as a standalone, add `is_decoder=True.`Fr  
r.   r/   r}   rB  warningr|  r\  rg  lm_headr  rF   s     rJ   r/   XmodForCausalLM.__init__m  sL       NNij 5A!&) 	rL   c                 .    U R                   R                  $ r   r  decoderr  s    rJ   get_output_embeddings%XmodForCausalLM.get_output_embeddingsz      ||###rL   c                 $    XR                   l        g r   r  rG   new_embeddingss     rJ   set_output_embeddings%XmodForCausalLM.set_output_embeddings~      -rL   rS   r   r   r+   r(   r   rT   r   r   labelsr4  r   r   r5  r6  r   c                    Ub  UOU R                   R                  nU
b  SnU R                  UUUUUUUUU	UUUUUS9nUS   nU R                  U5      nSnU
b*  U R                  " UU
4SU R                   R
                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, XmodForCausalLM, AutoConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
>>> config = AutoConfig.from_pretrained("facebook/xmod-base")
>>> config.is_decoder = True
>>> model = XmodForCausalLM.from_pretrained("facebook/xmod-base", config=config)
>>> model.set_default_language("en_XX")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```NF)r   r   r+   r(   r   rT   r   r   r4  r   r   r5  r6  r   r1   rq   )losslogitsr4  r   r?  r@  )rH   r  r\  r  loss_functionr1   r   r4  r   r?  r@  )rG   rS   r   r   r+   r(   r   rT   r   r   r  r4  r   r   r5  r6  kwargsr   r  prediction_scoreslm_lossr   s                         rJ   r[   XmodForCausalLM.forward  s"   \ &1%<k$++B]B]I,,))%'"7#9+/!5#  
" "!* LL9((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
rL   c                 P   ^ SnU H  nU[        U4S jU 5       5      4-  nM     U$ )Nr8  c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectr   rO   )r:  
past_statebeam_idxs     rJ   r<  1XmodForCausalLM._reorder_cache.<locals>.<genexpr>  s1     ncmU_--aZ=N=N1OPPcms   7:)rF  )rG   r4  r  reordered_past
layer_pasts     `  rJ   _reorder_cacheXmodForCausalLM._reorder_cache  s8    )Jncmnn N * rL   r  r\  )NNNNNNNNNNNNNNN)ra   rb   rc   rd   _tied_weights_keysr/   r  r  r   r   r@   r  r   r   r   r   r   r   r[   r  rf   rg   rh   s   @rJ   r  r  d  s    34JK
$.  15/36:59371559=A>B-1EI$(,0/3&*!Y
E,,-Y
 5++,Y
 !!2!23	Y

 !!1!12Y
 u//0Y
 E--.Y
   1 12Y
  ((9(9:Y
 !)):): ;Y
 ))*Y
 "%e.?.?(@"ABY
 D>Y
 $D>Y
 'tnY
  d^!Y
$ 
uU\\"$EE	F%Y
 Y
x rL   r  c                      ^  \ rS rSrSS/rU 4S jrS rS r\             SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                      \4   4S jj5       rSrU =r$ )XmodForMaskedLMi  r  r  c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzkIf you want to use `XmodForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  r  rF   s     rJ   r/   XmodForMaskedLM.__init__  sR     NN1
 !5A!&) 	rL   c                 .    U R                   R                  $ r   r  r  s    rJ   r  %XmodForMaskedLM.get_output_embeddings  r  rL   c                 $    XR                   l        g r   r  r  s     rJ   r  %XmodForMaskedLM.set_output_embeddings   r  rL   rS   r   r   r+   r(   r   rT   r   r   r  r   r5  r6  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	UUUS9nUS   nU R                  U5      nSnU
bF  [	        5       nU" UR                  SU R                   R                  5      U
R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )av  
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
N)r   r   r+   r(   r   rT   r   r   r   r5  r6  r   r)   rq   r  r  r   r?  )
rH   r  r\  r  r	   r   r1   r   r   r?  )rG   rS   r   r   r+   r(   r   rT   r   r   r  r   r5  r6  r   r  r  masked_lm_lossloss_fctr   s                       rJ   r[   XmodForMaskedLM.forward  s   4 &1%<k$++B]B],,))%'"7#9/!5#  
 "!* LL9')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rL   r  )NNNNNNNNNNNNN)ra   rb   rc   rd   r  r/   r  r  r   r   r@   r  r   r   r   r   r   r   r[   rf   rg   rh   s   @rJ   r  r    sr   24JK $.  15/36:59371559=A>B-1,0/3&*:
E,,-:
 5++,:
 !!2!23	:

 !!1!12:
 u//0:
 E--.:
   1 12:
  ((9(9::
 !)):): ;:
 ))*:
 $D>:
 'tn:
 d^:
 
uU\\"N2	3:
 :
rL   r  c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )rg  iB  z*Roberta Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g r   )r.   r/   r   rw   r2   r   r9   r:   
layer_normr1   r  	Parameterr@   rC   rd  rF   s     rJ   r/   XmodLMHead.__init__E  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrL   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r  r  rG   featuresr  r   s       rJ   r[   XmodLMHead.forwardN  s;    JJx GOOA LLOrL   c                     U R                   R                  R                  R                  S:X  a  U R                  U R                   l        g U R                   R                  U l        g )Nmeta)r  rd  rO   typer  s    rJ   _tie_weightsXmodLMHead._tie_weightsX  sC     <<##((F2 $		DLL))DIrL   )rd  r  r   r  )
ra   rb   rc   rd   re   r/   r[   r  rf   rg   rh   s   @rJ   rg  rg  B  s    4&* *rL   rg  z
    X-MOD Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                     ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )XmodForSequenceClassificationia  c                    > [         TU ]  U5        UR                  U l        Xl        [	        USS9U l        [        U5      U l        U R                  5         g NFr  )	r.   r/   
num_labelsrH   r|  r\  XmodClassificationHead
classifierr  rF   s     rJ   r/   &XmodForSequenceClassification.__init__i  sH      ++ 5A08 	rL   rS   r   r   r+   r(   r   rT   r  r   r5  r6  r   c                 2   Ub  UOU R                   R                  nU R                  UUUUUUUU	U
US9
nUS   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU(       d  U4US	S -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S
9$ )aa  
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N	r   r   r+   r(   r   rT   r   r5  r6  r   r   
regressionsingle_label_classificationmulti_label_classificationr)   rq   r  )rH   r  r\  r  problem_typer  r-   r@   rE   rt   r
   squeezer	   r   r   r   r   r?  rG   rS   r   r   r+   r(   r   rT   r  r   r5  r6  r   r  r  r  r  r   s                     rJ   r[   %XmodForSequenceClassification.forwardt  s   0 &1%<k$++B]B],,))%'/!5#  
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rL   )r  rH   r  r\  NNNNNNNNNNN)ra   rb   rc   rd   r/   r   r   r@   r  r   r   r   r   r   r   r[   rf   rg   rh   s   @rJ   r  r  a  s;   	  15/36:59371559-1,0/3&*H
E,,-H
 5++,H
 !!2!23	H

 !!1!12H
 u//0H
 E--.H
   1 12H
 ))*H
 $D>H
 'tnH
 d^H
 
uU\\"$<<	=H
 H
rL   r  c                     ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )XmodForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r.   r/   r|  r\  r   r;   r<   r=   rw   r2   r  r  rF   s     rJ   r/   XmodForMultipleChoice.__init__  sV      (zz&"<"<=))F$6$6: 	rL   rS   r   r+   r   r  r(   r   rT   r   r5  r6  r   c                    Ub  UOU R                   R                  nUb  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb2  UR                  UR	                  S5      UR	                  S5      -  5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
US9
nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
lang_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nr   r)   r   r   )	r   r(   r+   r   r   rT   r   r5  r6  rq   r  )rH   r  r   r   rD   repeatr\  r=   r  r	   r   r   r?  )rG   rS   r   r+   r   r  r(   r   rT   r   r5  r6  num_choicesflat_input_idsflat_lang_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   rX  r  reshaped_logitsr  r  r   s                             rJ   r[   XmodForMultipleChoice.forward  s   ` &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bRZRf	q(9INN1<M(MNlpLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ,,"*..,/!5#  
  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rL   )r  r=   r\  r  )ra   rb   rc   rd   r/   r   r   r@   r  r   r   r   r   r   r   r[   rf   rg   rh   s   @rJ   r  r    s;     15/3596:-1371559,0/3&*]
E,,-]
 5++,]
 !!1!12	]

 !!2!23]
 ))*]
 u//0]
 E--.]
   1 12]
 $D>]
 'tn]
 d^]
 
uU\\"$==	>]
 ]
rL   r  c                     ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )XmodForTokenClassificationi.  c                 d  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r  )r.   r/   r  r|  r\  classifier_dropoutr<   r   r;   r=   rw   r2   r  r  rG   rH   r  rI   s      rJ   r/   #XmodForTokenClassification.__init__1  s      ++ 5A)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rL   rS   r   r   r+   r(   r   rT   r  r   r5  r6  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
US9
nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r)   rq   r  )rH   r  r\  r=   r  r	   r   r  r   r   r?  r  s                     rJ   r[   "XmodForTokenClassification.forward?  s    , &1%<k$++B]B],,))%'/!5#  
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rL   )r  r=   r  r\  r  )ra   rb   rc   rd   r/   r   r   r@   r  r   r   r   r   r   r   r[   rf   rg   rh   s   @rJ   r  r  .  s-     15/36:59371559-1,0/3&*7
E,,-7
 5++,7
 !!2!23	7

 !!1!127
 u//07
 E--.7
   1 127
 ))*7
 $D>7
 'tn7
 d^7
 
uU\\"$99	:7
 7
rL   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  i{  z-Head for sentence-level classification tasks.c                 b  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        g r   )r.   r/   r   rw   r2   r   r  r<   r;   r=   r  out_projr  s      rJ   r/   XmodClassificationHead.__init__~  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrL   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ rV  )r=   r   r@   tanhr  r  s       rJ   r[   XmodClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rL   )r   r=   r  )	ra   rb   rc   rd   re   r/   r[   rf   rg   rh   s   @rJ   r  r  {  s    7I rL   r  c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )XmodForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r.   r/   r  r|  r\  r   rw   r2   
qa_outputsr  rF   s     rJ   r/   !XmodForQuestionAnswering.__init__  sU      ++ 5A))F$6$68I8IJ 	rL   rS   r   r   r+   r(   r   rT   start_positionsend_positionsr   r5  r6  r   c                 (   Ub  UOU R                   R                  nU R                  UUUUUUUU
UUS9
nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  U	b  [        UR                  5       5      S:  a  UR                  S5      n[        U	R                  5       5      S:  a  U	R                  S5      n	UR                  S5      nUR                  SU5      nU	R                  SU5      n	[        US9nU" UU5      nU" UU	5      nUU-   S-  nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S	9$ )
r  Nr  r   r   r)   r   )ignore_indexrq   )r  start_logits
end_logitsr   r?  )rH   r  r\  r  r  r  r   r   rD   clampr	   r   r   r?  )rG   rS   r   r   r+   r(   r   rT   r   r!  r   r5  r6  r   r  r  r$  r%  
total_lossignored_indexr  
start_lossend_lossr   s                           rJ   r[    XmodForQuestionAnswering.forward  s   * &1%<k$++B]B],,))%'/!5#  
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rL   )r  r  r\  )NNNNNNNNNNNN)ra   rb   rc   rd   r/   r   r   r@   r  r   r   r   r   r   r   r[   rf   rg   rh   s   @rJ   r  r    sT     15/36:593715596:48,0/3&*E
E,,-E
 5++,E
 !!2!23	E

 !!1!12E
 u//0E
 E--.E
   1 12E
 "%"2"23E
   0 01E
 $D>E
 'tnE
 d^E
 
uU\\"$@@	AE
 E
rL   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r   r   )nert   r@   cumsumtype_asrE   )rS   r#   rU   maskincremental_indicess        rJ   rP   rP     sW     <<$((*D <<!4<<TBE[[_cc##%33rL   )r  r  r  r  r  r  r|  r[  )r   )Are   r   typingr   r   r   r   r@   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   r   
generationr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_xmodr   
get_loggerra   rB  Moduler    rj   r   r   r   r   r   r  r*  rP  r[  r|  r  r  rg  r  r  r  r  r  rP   __all__r8  rL   rJ   <module>r?     sP     / /    A A ' )	 	 	 . l l , * 
		H	%V=RYY V=tC		 CLRYY 5BII 5rryy ")) $/ /d\3		 \3~b
")) b
L  30/ 30 30l ^
# ^
^
B 
{)? {
{| V
) V
 V
t* *> V
$7 V
V
r j
/ j
 j
Z H
!4 H
 H
XRYY , R
2 R
 R
l4 	rL   