
    fTh`                        S r SSKrSSKJr  SSKJrJrJr  SSKrSSK	rSSKJ
r
  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJrJrJrJrJr  SSKJr  SSKJrJrJr  SSK J!r!J"r"J#r#J$r$  SSK%J&r&  \$RN                  " \(5      r)Sq*S r+S r,SHS jr-SHS jr.SHS jr/S r0 " S S\Rb                  Rd                  5      r3 " S S\Rb                  Rd                  5      r4 " S S5      r5SIS jr6S r7   SJS jr8 " S S \
Rr                  5      r: " S! S"\
Rr                  5      r; " S# S$\
Rr                  5      r< " S% S&\
Rr                  5      r= " S' S(\
Rr                  5      r> " S) S*\
Rr                  5      r? " S+ S,\
Rr                  5      r@ " S- S.\
Rr                  5      rA " S/ S0\
Rr                  5      rB " S1 S2\
Rr                  5      rC " S3 S4\
Rr                  5      rD\! " S5 S6\5      5       rE\! " S7 S8\E5      5       rF\! " S9 S:\E5      5       rG " S; S<\
Rr                  5      rH\!" S=S>9 " S? S@\E5      5       rI\! " SA SB\E5      5       rJ\! " SC SD\E5      5       rK\! " SE SF\E5      5       rL/ SGQrMg)KzPyTorch MRA model.    N)Path)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigc                     ^ [        [        5      R                  5       R                  R                  R                  S-  S-  mU4S jn U " / SQ5      n[	        SUSS9qg )Nkernelsmrac                 :   > U  Vs/ s H  nTU-  PM
     sn$ s  snf N )filesfile
src_folders     \/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mra/modeling_mra.pyappend_root&load_cuda_kernels.<locals>.append_root3   s     .34ed
T!e444s   )zcuda_kernel.cuzcuda_launch.cuztorch_extension.cppcuda_kernelT)verbose)r   __file__resolveparentr   mra_cuda_kernel)r(   	src_filesr&   s     @r'   load_cuda_kernelsr1   /   sQ    h'')0077>>JURJ5 WXI=)TBO    c                 H   [        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  S5      S:w  a  [        S5      eU R                  S5      S:w  a  [        S5      eU R                  S	S
9R                  R                  SS	5      nUR                  5       nUR                  5       nUR                  5       n[        R                  XAX#5      u  pVUR                  SS	5      SS2SS2SSS24   nXV4$ )z0
Computes maximum values for softmax stability.
   z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr/   	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatters          r'   
sparse_maxrL   ;   s    > !Q&IJJ
7<<>aBCC1#YZZ1#XYY###+22<<RDJ&&(JkkmG  "G!0!:!::P_!oH'11"b9!Qa-H%%r2   c                    [        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  S   UR                  S   :w  a  [        S5      eU R                  u  p4XB-  n[        R
                  " UR                  S5      [        R                  UR                  S9nU R                  X5U5      n XSS2S4   X-  R                  5       SS24   n U $ )zF
Converts attention mask to a sparse mask for high resolution logits.
r5   z$mask must be a 2-dimensional tensor.r6   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r<   r=   r>   shapetorcharangelongrP   reshape)maskrF   
block_size
batch_sizeseq_len	num_block	batch_idxs          r'   sparse_maskr\   W   s     499;1?@@
7<<>aBCCzz!}a((]^^**J%IW\\!_EJJw~~VI<<
z:D!T'"W%8$>$>$@!CDDKr2   c                 f   U R                  5       u  pEnUR                  5       u  pxnXS-  S:w  a  [        S5      eX-  S:w  a  [        S5      eU R                  XEU-  X65      R                  SS5      n UR                  XHU-  X65      R                  SS5      n[	        U R                  5       5      S:w  a  [        S5      e[	        UR                  5       5      S:w  a  [        S5      e[	        UR                  5       5      S	:w  a  [        S
5      eU R                  S5      S:w  a  [        S5      eUR                  S5      S:w  a  [        S5      eU R                  5       n UR                  5       nUR                  5       nUR                  5       n[        R                  XUR                  5       5      $ )z/
Performs Sampled Dense Matrix Multiplication.
r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r;   r8   r4   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r5   r6   r   r7   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r=   r>   rU   rA   r<   rB   rC   r/   mm_to_sparse)	dense_query	dense_keyrF   rW   rX   
query_sizer:   _key_sizes	            r'   r`   r`   n   s    #."2"2"4JC ~~'A!#opp!kll%%j
2JJ\ffgikmnK!!**.DjV``aceghI
;!#FGG
9>>!DEE
7<<>aBCCb IJJ~~aBGHH((*K$$&IkkmG  "G''NNr2   c                 B   UR                  5       u  pVnXd-  S:w  a  [        S5      eU R                  S5      U:w  a  [        S5      eU R                  S5      U:w  a  [        S5      eUR                  XVU-  XG5      R                  SS5      n[	        U R                  5       5      S	:w  a  [        S
5      e[	        UR                  5       5      S	:w  a  [        S5      e[	        UR                  5       5      S:w  a  [        S5      eUR                  S5      S:w  a  [        S5      eU R                  5       n UR                  5       nUR                  5       nUR                  5       n[        R                  XX#5      nUR                  SS5      R                  XSU-  U5      nU$ )zH
Performs matrix multiplication of a sparse matrix with a dense matrix.
r   r^   r5   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r;   r8   r4   ,sparse_query must be a 4-dimensional tensor.r_   r6   r7   z8The size of the third dimension of dense_key must be 32.)	r=   r>   rU   rA   r<   rB   rC   r/   sparse_dense_mm)	sparse_queryrF   rb   rG   rW   rX   re   r:   dense_qk_prods	            r'   rh   rh      s    !* 0J#!kllz)lmmz)kll!!**.DjV``aceghI
<1$GHH
9>>!DEE
7<<>aBCC~~aBSTT**,LkkmG  "G$$&I#33L9fM!++B3;;JZdHdfijMr2   c                 X    X-  U-  [         R                  " XSS9-   R                  5       $ )Nfloorrounding_mode)rR   divrT   )rF   dim_1_blockdim_2_blocks      r'   transpose_indicesrr      s*    "k1EIIgbi4jjpprrr2   c                   H    \ rS rSr\S 5       r\S 5       r\SS j5       rSrg)MraSampledDenseMatMul   c                 N    [        XX45      nU R                  XU5        X@l        U$ r"   )r`   save_for_backwardrW   )ctxra   rb   rF   rW   rE   s         r'   forwardMraSampledDenseMatMul.forward   s)    %kgRkg>#r2   c                     U R                   u  p#nU R                  nUR                  S5      U-  nUR                  S5      U-  n[        XFU5      n[	        UR                  SS5      XU5      n	[	        XX65      n
XS S 4$ Nr   r;   r8   )saved_tensorsrW   r=   rr   rh   rA   )rx   gradra   rb   rF   rW   rG   rH   	indices_Tgrad_key
grad_querys              r'   backwardMraSampledDenseMatMul.backward   s    *-*;*;'^^
%**1-;!q)Z7%gN	"4>>"b#99S`a$TIO
T4//r2   c                 .    [         R                  XX#5      $ r"   )rt   apply)ra   rb   rF   rW   s       r'   operator_call#MraSampledDenseMatMul.operator_call   s    $**;7WWr2   r#   Nr7   	__name__
__module____qualname____firstlineno__staticmethodry   r   r   __static_attributes__r#   r2   r'   rt   rt      s>      0 0 X Xr2   rt   c                   D    \ rS rSr\S 5       r\S 5       r\S 5       rSrg)MraSparseDenseMatMul   c                 N    [        XX45      nU R                  XU5        X@l        U$ r"   )rh   rw   rG   )rx   ri   rF   rb   rG   rE   s         r'   ry   MraSparseDenseMatMul.forward   s*    (	[lY?-r2   c                     U R                   u  p#nU R                  nUR                  S5      UR                  S5      -  n[        X5U5      n[	        UR                  SS5      XqU5      n[        XU5      n	U	S US 4$ r|   )r}   rG   r=   rr   rh   rA   r`   )
rx   r~   ri   rF   rb   rG   rH   r   r   r   s
             r'   r   MraSparseDenseMatMul.backward   s~    +.+<+<(y--!q)\->->r-BB%gN	"<#9#9"b#A9Tab!$7;
44//r2   c                 .    [         R                  XX#5      $ r"   )r   r   )ri   rF   rb   rG   s       r'   r   "MraSparseDenseMatMul.operator_call   s    #)),\\r2   r#   Nr   r#   r2   r'   r   r      s>      0 0 ] ]r2   r   c                   $    \ rS rSr\S 5       rSrg)MraReduceSum   c                    U R                  5       u  pEpg[        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  5       u    pvnUR                  5       u  pEU R                  SS9R	                  XE-  U5      n [
        R                  " UR                  S5      [
        R                  UR                  S9n[
        R                  " XSS	9R                  5       US S 2S 4   U-  -   R	                  XE-  5      n	[
        R                  " XB-  U4U R                  U R                  S9n
U
R                  SX5      R	                  XBU5      nUR	                  XBU-  5      nU$ )
Nr4   rg   r5   r6   r9   r   rN   rl   rm   )r=   r<   r>   sumrU   rR   rS   rT   rP   ro   zerosrO   	index_add)ri   rF   rG   rH   rX   rZ   rW   rd   r[   global_idxestempoutputs               r'   r   MraReduceSum.operator_call   sb   /;/@/@/B,
z|  "#q(KLLw||~!#FGG*//11! '
#''A'.66z7MzZLLa

7>>Z	IIgGDIIKiXY[_X_N`crNrr
'*(
) 	 {{):6l>P>PYeYlYl
 <>FFzdno
j,HIr2   r#   N)r   r   r   r   r   r   r   r#   r2   r'   r   r      s     r2   r   c                    U R                  5       u  pVnXb-  nSn	Ub  UR                  XXU5      R                  SS9n
U R                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  nUR                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  nUb/  UR                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  n	OU[        R                  " XX[        R
                  U R                  S9-  n
U R                  XXX'5      R                  SS9nUR                  XXX'5      R                  SS9nUb  UR                  XXX'5      R                  SS9n	[        R                  " XR                  SS5      5      [        R                  " U5      -  nUR                  SSS9R                  nUb0  US	U
SS2SSS24   U
SS2SS2S4   -  S
:  R                  5       -  -
  nXX4$ )z'
Compute low resolution approximation.
Nr;   r9   r8   ư>rN   T)r:   keepdims     @g      ?)r=   rU   r   rR   onesfloatrP   meanmatmulrA   mathsqrtr?   r@   )querykeyrW   rV   valuerX   rY   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxs                  r'   get_low_resolution_logitr     s    %*JJL!J-Ill:*MQQVXQYMM*VZZ_aZb1d
#d*
	 ++jZRVV[]V^1d
#d*
 jZZ^^ce^fAq$J'$.I !5::jSXS^S^glgsgs#ttMM*V[[`b[c	++jZRWW\^W_jZZ__df_gI <<	3D3DR3LMPTPYPYZbPcc#7#;#;T#;#R#Y#Y  3;q$z+B[QRTUW[Q[E\+\`c*c)j)j)l#ll 	  .JUUr2   c                    U R                   u  pVnUS:  a]  US-  n[        R                  " XfU R                  S9n	[        R                  " [        R
                  " X* S9US9n
X
SSS2SS24   S-  -   n US:  a:  U SS2SU2SS24   S-   U SS2SU2SS24'   U SS2SS2SU24   S-   U SS2SS2SU24'   [        R                  " U R                  US5      USSS	S
9nUR                  nUS:X  a@  UR                  R                  SS9R                  nXSS2SS4   :  R                  5       nX4$ US:X  a  SnX4$ [        U S35      e)zR
Compute the indices of the subset of components to be used in the approximation.
r   r5   rP   )diagonalNg     @r;   TF)r:   largestsortedfullr9   sparsez# is not a valid approx_model value.)rQ   rR   r   rP   triltriutopkrU   rF   r@   minr   r>   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrX   total_blocks_per_rowrd   offset	temp_maskdiagonal_mask
top_k_valsrF   	thresholdhigh_resolution_masks                  r'   get_block_idxesr   6  s    +?*D*D'Ja&*0A5JJ3RfRmRmn	

5::i'#JU[\3D!QJ6ORU6UU#a' $A%A$A1!DEK 	Q =!= =q@A !A'D(D'D!DEK 	Q#@$@#@@A $$Z4jbRV_dJ   Gf%%))b)188	 4!T4-8P PWWY (( 
	 # (( K=(KLMMr2   c	                    [         c$  [        R                  " U 5      R                  5       $ U R	                  5       u  ppX-  nX-  S:w  a  [        S5      eX-  nU R                  XU5      n UR                  XU5      nUR                  XU5      nUb*  XSS2SS2S4   -  n XSS2SS2S4   -  nX#SS2SS2S4   -  nUS:X  a  [        XXcU5      u  nnnnOAUS:X  a0  [        R                  " 5          [        XXc5      u  nnnnSSS5        O[        S5      e[        R                  " 5          WW-
  n[        UUUUU5      u  nnSSS5        [        R                  XWUS9[        R                  " U5      -  n[        UUX5      u  nnUU-
  nUb"  USS	[!        UU5      SS2SS2SS2S4   -
  -  -
  n[        R"                  " U5      n[$        R                  UUX.5      n[&        R                  UUX5      nUS:X  Gax  [        R"                  " WW-
  SW-  -
  5      WSS2SSS24   -  n[        R(                  " UW5      SS2SS2SSS24   R+                  S	S	US	5      R                  XU5      nUR-                  S
S9SS2SS2S4   R+                  S	S	U5      R                  X5      nUR+                  S	S	U5      R                  X5      U-
  n Ub  U U-  n [        R"                  " U U S:*  R/                  5       -  5      n!UU!SS2SS2S4   -  nUU!-  n[        R"                  " U * U S:  R/                  5       -  5      n"UU"SS2SS2S4   -  nUU"-  nUU-   USS2SS2S4   USS2SS2S4   -   S-   -  n#O$US:X  a  UUSS2SS2S4   S-   -  n#O[        S5      eUb  U#USS2SS2S4   -  n#U#R                  XX5      n#U#$ ! , (       d  f       GN= f! , (       d  f       GN= f)z(
Use Mra to approximate self-attention.
Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rW   r   r   r;   r9   r   z-config.approx_mode must be "full" or "sparse")r/   rR   
zeros_likerequires_grad_r=   r>   rU   r   no_grad	Exceptionr   rt   r   r   r   rL   r\   expr   r   r   repeatr   r   )$r   r   r   rV   r   r   rW   r   r   rX   num_headrY   r   
meta_batchr   r   r   r   r   rd   low_resolution_logit_normalizedrF   r   high_resolution_logitrJ   rK   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layers$                                       r'   mra2_attentionr   \  sq    &5577.3jjl+J'&Jq OPP-MM*x8E
++j8
4CMM*x8EQ4Z((At$$Q4Z((fUm
%V
Rk+G 
	 ]]_QiJRN +/KQ _
 @AA	*>A]*]'(7+(+)
%% 
 2??G
 @ 		( ",,A7L]!qH14DD 5q;tU\C]^_abdegk^kCl?l8m m 99%:;3AAgu  ".!;!;g'8" fII*-IICRfLffg!T1*%& 	 LL,i8AtQGVAq*a(WZ(3 	   ###+Aq$J7>>q!ZPXXYcm 	" 6<<Q:NVVWaknvv+d2N#ii.A:M9T9T9V(VW"9<OPQSTVZPZ<["[$=@S$S!$yy.NQ<N;U;U;W)WX#;>RSTVWY]S]>^#^ %?BV%V"14KK&q!Tz25NqRSUYz5ZZ]aa
 
	 04NqRSUYz4Z]a4abGHH%Q4Z(88!))*RMS _ 
s   1N?,O?
O
O c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )MraEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c           	      f  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  S-   UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      S-   5        [+        USS5      U l        U R#                  S[$        R.                  " U R0                  R3                  5       [$        R4                  U R0                  R6                  S	9S
S9  g )N)padding_idxr5   epsposition_ids)r   r;   position_embedding_typeabsolutetoken_type_idsrN   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrR   rS   expandgetattrr   r   r   r=   rT   rP   selfconfig	__class__s     r'   r   MraEmbeddings.__init__  s?   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	^U\\&:X:X-Y-`-`ah-ilm-mn'.v7PR\']$KK))..0

4K\K\KcKcd 	 	
r2   c                 `   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  S:X  a  U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr;   r   r   r   rN   r   )r=   r   hasattrr   r  rR   r   rT   rP   r   r   r   r   r   r  )r  	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r'   ry   MraEmbeddings.forward  s:    #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r2   )r   r  r   r   r   r   )NNNN	r   r   r   r   __doc__r   ry   r   __classcell__r  s   @r'   r   r     s    Q
(   r2   r   c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )MraSelfAttentioni	  c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      e[        S Ln[        5       (       a!  [        5       (       a  U(       d   [        5         UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [         R"                  " UR                  U R                  5      U l        [         R"                  " UR                  U R                  5      U l        [         R"                  " UR                  U R                  5      U l        [         R*                  " UR,                  5      U l        Ub  UOUR0                  U l        UR2                  S-  UR4                  -  U l        [9        U R6                  [        UR2                  S-  S-  5      5      U l        UR:                  U l        UR<                  U l        UR>                  U l        g ! [         a#  n[        R                  SU 35         S nAGNS nAff = f)	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r7   r5   ) r   r   r   num_attention_headsr  r>   r/   r   r   r1   r   loggerwarningrC   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr  r   r   block_per_rowrZ   r   r   r   r   )r  r  r   kernel_loadeder  s        r'   r   MraSelfAttention.__init__
  s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 (t3"$$);)=)=mn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'>'J#PVPnPn 	$ !88B>&BVBVVT^^S&2P2PTV2V[\1\-]^!--,2,O,O)/5/U/U,+  n!hijhklmmns   
I 
I3I..I3c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nr;   r   r5   r   r   )r=   r  r   viewpermute)r  layernew_layer_shapes      r'   transpose_for_scores%MraSelfAttention.transpose_for_scores-  sM    **,s+t/G/GIaIa.bb

O,}}Q1a((r2   c                    U R                  U5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      nUR	                  5       u  pxpSUS-  -   nUR                  5       R                  SUS5      R                  Xx-  U	5      R                  5       nSnX:  a  XxXU
-
  4n[        R                  " U[        R                  " XR                  S9/SS9n[        R                  " U[        R                  " XR                  S9/SS9n[        R                  " U[        R                  " XR                  S9/SS9n[        UR                  5       UR                  5       UR                  5       UR                  5       U R                  U R                   U R"                  U R$                  S9nX:  a  US S 2S S 2S S 2S U
24   nUR                  XxX5      nUR'                  S	S
SS5      R)                  5       nUR	                  5       S S U R*                  4-   nUR,                  " U6 nU4nU$ )N      ?r   r   r7   r   r;   r9   )r   r   r   r   r5   r   r8   )r   r-  r   r   r=   squeezer   rU   rC   rR   catr   rP   r   r   rZ   r   r   r   r*  rB   r!  r)  )r  hidden_statesattention_maskmixed_query_layer	key_layervalue_layerquery_layerrX   	num_headsrY   r   gpu_warp_sizepad_sizer   new_context_layer_shapeoutputss                   r'   ry   MraSelfAttention.forward2  s)    JJ}5--dhh}.EF	//

=0IJ//0AB3>3C3C3E0
w ~77""$++Ay!<DDZE[]deiik 	 #!gx7OOH))[%++hOaOa2b$ciklK		9ekk(K[K[.\"]cefI))[%++hOaOa2b$ciklK&OO  "NN(()-)J)J,0,P,P	
 #)!Q9H9*<=M%--jWW%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD "r2   )r!  r   r   r  r   r   r   r  rZ   r   r   r   r"   )	r   r   r   r   r   r-  ry   r   r  r  s   @r'   r  r  	  s    !VF)
0 0r2   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )MraSelfOutputif  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr   )r   r   r   r"  r   denser   r   r   r   r  r  s     r'   r   MraSelfOutput.__init__g  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r2   r3  input_tensorreturnc                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r"   rC  r  r   r  r3  rE  s      r'   ry   MraSelfOutput.forwardm  5    

=1]3}'CDr2   r   rC  r  
r   r   r   r   r   rR   Tensorry   r   r  r  s   @r'   r@  r@  f  6    >U\\  RWR^R^  r2   r@  c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )MraAttentionit  c                 |   > [         TU ]  5         [        XS9U l        [	        U5      U l        [        5       U l        g )N)r   )r   r   r  r  r@  r   setpruned_heads)r  r  r   r  s      r'   r   MraAttention.__init__u  s0    $V]	#F+Er2   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r9   )r<   r   r  r  r   rT  r   r   r   r   r   rC  r!  union)r  headsindexs      r'   prune_headsMraAttention.prune_heads{  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r2   c                 d    U R                  X5      nU R                  US   U5      nU4USS  -   nU$ Nr   r   )r  r   )r  r3  r4  self_outputsattention_outputr=  s         r'   ry   MraAttention.forward  s>    yy?;;|AF#%QR(88r2   )r   rT  r  r"   )	r   r   r   r   r   rZ  ry   r   r  r  s   @r'   rQ  rQ  t  s    ";$ r2   rQ  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r"   )r   r   r   r"  r   intermediate_sizerC  
isinstance
hidden_actstrr   intermediate_act_fnr  s     r'   r   MraIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r2   r3  rF  c                 J    U R                  U5      nU R                  U5      nU$ r"   rC  rh  r  r3  s     r'   ry   MraIntermediate.forward  s&    

=100?r2   rk  rM  r  s   @r'   rb  rb    s(    9U\\ ell  r2   rb  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )	MraOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g rB  )r   r   r   r"  rd  r   rC  r   r   r   r   r  r  s     r'   r   MraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r2   r3  rE  rF  c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r"   rH  rI  s      r'   ry   MraOutput.forward  rK  r2   rL  rM  r  s   @r'   ro  ro    rO  r2   ro  c                   8   ^  \ rS rSrU 4S jrSS jrS rSrU =r$ )MraLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        [        U5      U l        [        U5      U l
        g Nr   )r   r   chunk_size_feed_forwardseq_len_dimrQ  	attentionadd_cross_attentionrb  intermediatero  r   r  s     r'   r   MraLayer.__init__  sW    '-'E'E$%f-#)#=#= +F3'r2   c                     U R                  X5      nUS   nUSS  n[        U R                  U R                  U R                  U5      nU4U-   nU$ r]  )rz  r   feed_forward_chunkrx  ry  )r  r3  r4  self_attention_outputsr_  r=  layer_outputs          r'   ry   MraLayer.forward  sa    !%!N1!4(,0##T%A%A4CSCSUe
  /G+r2   c                 J    U R                  U5      nU R                  X!5      nU$ r"   )r|  r   )r  r_  intermediate_outputr  s       r'   r  MraLayer.feed_forward_chunk  s)    "//0@A{{#6Ir2   )r{  rz  rx  r|  r   ry  r"   )	r   r   r   r   r   ry   r  r   r  r  s   @r'   ru  ru    s    ( r2   ru  c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )
MraEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r   r   r  r   
ModuleListrangenum_hidden_layersru  r+  gradient_checkpointing)r  r  rd   r  s      r'   r   MraEncoder.__init__  sR    ]]eFD\D\>]#^>]HV$4>]#^_
&+# $_s   A&c                 f   U(       a  SOS n[        U R                  5       H^  u  pxU(       a  Xa4-   nU R                  (       a/  U R                  (       a  U R	                  UR
                  UU5      n	OU" X5      n	U	S   nM`     U(       a  Xa4-   nU(       d  [        S X4 5       5      $ [        UUS9$ )Nr#   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr"   r#   ).0vs     r'   	<genexpr>%MraEncoder.forward.<locals>.<genexpr>  s     X$Fq$Fs   	)last_hidden_stater3  )	enumerater+  r  training_gradient_checkpointing_func__call__tupler   )
r  r3  r4  	head_maskoutput_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss
             r'   ry   MraEncoder.forward  s     #7BD(4OA#$58H$H!**t}} $ A A ))!"! !-] K)!,M  5   14D DX]$FXXX1++
 	
r2   )r  r  r+  )NNFT)r   r   r   r   r   ry   r   r  r  s   @r'   r  r    s     , "!
 !
r2   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g rB  )r   r   r   r"  r   rC  re  rf  rg  r   transform_act_fnr   r   r  s     r'   r   #MraPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr2   r3  rF  c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r"   )rC  r  r   rl  s     r'   ry   "MraPredictionHeadTransform.forward  s4    

=1--m<}5r2   )r   rC  r  rM  r  s   @r'   r  r    s)    UU\\ ell  r2   r  c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )MraLMPredictionHeadi  c                 H  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )NF)bias)r   r   r  	transformr   r"  r   r   decoder	ParameterrR   r   r  r  s     r'   r   MraLMPredictionHead.__init__  sm    3F; yy!3!3V5F5FUSLLV->->!?@	 !IIr2   c                 :    U R                   U R                  l         g r"   )r  r  r  s    r'   _tie_weights MraLMPredictionHead._tie_weights  s     IIr2   c                 J    U R                  U5      nU R                  U5      nU$ r"   )r  r  rl  s     r'   ry   MraLMPredictionHead.forward  s$    }5]3r2   )r  r  r  )	r   r   r   r   r   r  ry   r   r  r  s   @r'   r  r    s    && r2   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraOnlyMLMHeadi&  c                 B   > [         TU ]  5         [        U5      U l        g r"   )r   r   r  predictionsr  s     r'   r   MraOnlyMLMHead.__init__'  s    .v6r2   sequence_outputrF  c                 (    U R                  U5      nU$ r"   r  )r  r  prediction_scoress      r'   ry   MraOnlyMLMHead.forward+  s     ,,_=  r2   r  rM  r  s   @r'   r  r  &  s(    7!u|| ! ! !r2   r  c                   &    \ rS rSr\rSrSrS rSr	g)MraPreTrainedModeli0  r    Tc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weightsg        )r   stdNr0  )re  r   r"  weightdatanormal_r  initializer_ranger  zero_r   r   r   fill_)r  modules     r'   _init_weights MraPreTrainedModel._init_weights7  s   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r2   r#   N)
r   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointingr  r   r#   r2   r'   r  r  0  s     L&*#*r2   r  c                   8  ^  \ rS rSrU 4S jrS rS rS r\        SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )MraModeliH  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r"   )r   r   r  r   r  r  encoder	post_initr  s     r'   r   MraModel.__init__J  s9     '/!&) 	r2   c                 .    U R                   R                  $ r"   r  r   r  s    r'   get_input_embeddingsMraModel.get_input_embeddingsT  s    ...r2   c                 $    XR                   l        g r"   r  )r  r   s     r'   set_input_embeddingsMraModel.set_input_embeddingsW  s    */'r2   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r+  rz  rZ  )r  heads_to_pruner+  rX  s       r'   _prune_headsMraModel._prune_headsZ  s<    
 +002LELLu%//;;EB 3r2   r  r4  r   r   r  r  r  r  rF  c	                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb"  U R	                  X5        UR                  5       n	O"Ub  UR                  5       S S n	O[        S5      eU	u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U24   nUR                  X5      nUnO$[        R                  " U	[        R                  US9nU R                  X)5      nU R!                  XPR                   R"                  5      nU R                  UUUUS9nU R%                  UUUUUS9nUS	   nU(       d	  U4US
S  -   $ ['        UUR(                  UR*                  UR,                  S9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer;   z5You have to specify either input_ids or inputs_embedsr   r   rN   )r  r   r   r  )r4  r  r  r  r   r   )r  r3  
attentionscross_attentions)r  r  use_return_dictr>   %warn_if_padding_and_no_attention_maskr=   rP   rR   r   r  r  r   r  r   rT   get_extended_attention_maskget_head_maskr  r  r   r3  r  r  )r  r  r4  r   r   r  r  r  r  r  rX   r  rP   r  r  extended_attention_maskembedding_outputencoder_outputsr  s                      r'   ry   MraModel.forwardb  s    %9$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z 150P0PQ_0m &&y++2O2OP	??%)'	 + 
 ,,2!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r2   )r  r  r  )NNNNNNNN)r   r   r   r   r   r  r  r  r   r   rR   rN  boolr   r   r   ry   r   r  r  s   @r'   r  r  H  s    /0C  -11515/3,004/3&*J
ELL)J
 !.J
 !.	J

 u||,J
 ELL)J
  -J
 'tnJ
 d^J
 
u88	9J
 J
r2   r  c                   Z  ^  \ rS rSrSS/rU 4S jrS rS r\         SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )MraForMaskedLMi  zcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r"   )r   r   r  r    r  clsr  r  s     r'   r   MraForMaskedLM.__init__  s4     F#!&) 	r2   c                 B    U R                   R                  R                  $ r"   )r  r  r  r  s    r'   get_output_embeddings$MraForMaskedLM.get_output_embeddings  s    xx##+++r2   c                     XR                   R                  l        UR                  U R                   R                  l        g r"   )r  r  r  r  )r  new_embeddingss     r'   set_output_embeddings$MraForMaskedLM.set_output_embeddings  s*    '5$$2$7$7!r2   r  r4  r   r   r  r  labelsr  r  rF  c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9n
U
S   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU	(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                  U
R                  S9$ )a{  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Nr4  r   r   r  r  r  r  r   r;   r   losslogitsr3  r  )
r  r  r    r  r	   r)  r   r   r3  r  )r  r  r4  r   r   r  r  r  r  r  r=  r  r  masked_lm_lossloss_fctr   s                   r'   ry   MraForMaskedLM.forward  s    & &1%<k$++B]B](())%'!5#  	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r2   )r  r    	NNNNNNNNN)r   r   r   r   _tied_weights_keysr   r  r  r   r   rR   rN  r  r   r   r   ry   r   r  r  s   @r'   r  r    s   :<Z[,8  -11515/3,004)-/3&*0
ELL)0
 !.0
 !.	0

 u||,0
 ELL)0
  -0
 &0
 'tn0
 d^0
 
un$	%0
 0
r2   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MraClassificationHeadi  z-Head for sentence-level classification tasks.c                 8  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        Xl        g r"   )r   r   r   r"  r   rC  r   r   r  
num_labelsout_projr  r  s     r'   r   MraClassificationHead.__init__  se    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr2   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        U R                  R                     " U5      nU R                  U5      nU R                  U5      nU$ )Nr   )r  rC  r   r  rf  r
  )r  featureskwargsxs       r'   ry   MraClassificationHead.forward  se    Q1WLLOJJqM4;;))*1-LLOMM!r2   )r  rC  r  r
  r  r  s   @r'   r  r    s    7 r2   r  z
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   F  ^  \ rS rSrU 4S jr\         SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )MraForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r"   )r   r   r	  r  r    r  
classifierr  r  s     r'   r   %MraForSequenceClassification.__init__  sA      ++F#/7 	r2   r  r4  r   r   r  r  r  r  r  rF  c
                 .   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9n
U
S   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU	(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                   U
R"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr;   r  )r  r  r    r  problem_typer	  rO   rR   rT   rC   r
   r1  r	   r)  r   r   r3  r  )r  r  r4  r   r   r  r  r  r  r  r=  r  r   r  r  r   s                   r'   ry   $MraForSequenceClassification.forward  s   & &1%<k$++B]B](())%'!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r2   )r  r    r	  r  )r   r   r   r   r   r   r   rR   rN  r  r   r   r   ry   r   r  r  s   @r'   r  r    s      -11515/3,004)-/3&*A
ELL)A
 !.A
 !.	A

 u||,A
 ELL)A
  -A
 &A
 'tnA
 d^A
 
u..	/A
 A
r2   r  c                   F  ^  \ rS rSrU 4S jr\         SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )MraForMultipleChoiceic  c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  S5      U l        U R                  5         g rw  )
r   r   r  r    r   r"  r   pre_classifierr  r  r  s     r'   r   MraForMultipleChoice.__init__e  s_     F# ii(:(:F<N<NO))F$6$6: 	r2   r  r4  r   r   r  r  r  r  r  rF  c
                    U	b  U	OU R                   R                  n	Ub  UR                  S   OUR                  S   n
Ub!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	S9nUS   nUSS2S4   nU R                  U5      n[        R                  " 5       " U5      nU R                  U5      nUR                  SU
5      nSnUb  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r;   r8   r  r   r  )r  r  rQ   r)  r=   r    r   r   ReLUr  r	   r   r3  r  )r  r  r4  r   r   r  r  r  r  r  num_choicesr=  hidden_statepooled_outputr   reshaped_logitsr  r  r   s                      r'   ry   MraForMultipleChoice.forwardo  s   V &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 (())%'!5#  	
 qz$QT*++M:	-0/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r2   )r  r    r   r  )r   r   r   r   r   r   r   rR   rN  r  r   r   r   ry   r   r  r  s   @r'   r  r  c  s      -11515/3,004)-/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 'tnX
 d^X
 
u//	0X
 X
r2   r  c                   F  ^  \ rS rSrU 4S jr\         SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )MraForTokenClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r"   )r   r   r	  r  r    r   r   r   r  r"  r   r  r  r  s     r'   r   "MraForTokenClassification.__init__  si      ++F#zz&"<"<=))F$6$68I8IJ 	r2   r  r4  r   r   r  r  r  r  r  rF  c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9n
U
S   nU R                  U5      nU R	                  U5      nSnUb  [        5       nUb  UR                  S5      S:H  nUR                  SU R                  5      n[        R                  " XR                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR                  SU R                  5      UR                  S5      5      nU	(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                  U
R                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r;   r   r  )r  r  r    r  r  r	   r)  r	  rR   wheretensorignore_indextype_asr   r3  r  )r  r  r4  r   r   r  r  r  r  r  r=  r  r   r  r  active_lossactive_logitsactive_labelsr   s                      r'   ry   !MraForTokenClassification.forward  sf   " &1%<k$++B]B](())%'!5#  	
 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r2   )r  r  r    r	  r  )r   r   r   r   r   r   r   rR   rN  r  r   r   r   ry   r   r  r  s   @r'   r*  r*    s    	  -11515/3,004)-/3&*9
ELL)9
 !.9
 !.	9

 u||,9
 ELL)9
  -9
 &9
 'tn9
 d^9
 
u++	,9
 9
r2   r*  c                   f  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )MraForQuestionAnsweringi  c                    > [         TU ]  U5        SUl        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g )Nr5   )
r   r   r	  r  r    r   r"  r   
qa_outputsr  r  s     r'   r    MraForQuestionAnswering.__init__  s[      ++F#))F$6$68I8IJ 	r2   r  r4  r   r   r  r  start_positionsend_positionsr  r  rF  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  pUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU
(       d  X4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   r;   r9   )r0  r5   )r  start_logits
end_logitsr3  r  )r  r  r    r9  splitr1  r<   r=   clampr	   r   r3  r  )r  r  r4  r   r   r  r  r;  r<  r  r  r=  r  r   r>  r?  
total_lossignored_indexr  
start_lossend_lossr   s                         r'   ry   MraForQuestionAnswering.forward#  s    &1%<k$++B]B](())%'!5#  	
 "!*1#)<<r<#: #++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r2   )r    r	  r9  )
NNNNNNNNNN)r   r   r   r   r   r   r   rR   rN  r  r   r   r   ry   r   r  r  s   @r'   r7  r7    s   
  -11515/3,0042604/3&*<
ELL)<
 !.<
 !.	<

 u||,<
 ELL)<
  -<
 "%,,/<
  -<
 'tn<
 d^<
 
u22	3<
 <
r2   r7  )r  r  r7  r  r*  ru  r  r  r   )NN)r7   r   r   )Nr  r   pathlibr   typingr   r   r   rR   torch.utils.checkpointr   torch.nnr   r	   r
   torch.utils.cpp_extensionr   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_mrar   
get_loggerr   r  r/   r1   rL   r\   r`   rh   rr   autogradFunctionrt   r   r   r   r   r   Moduler   r  r@  rQ  rb  ro  ru  r  r  r  r  r  r  r  r  r  r  r*  r7  __all__r#   r2   r'   <module>rW     s      ) )    A A * !  . l l Y Y ( 
		H	%	C&8.%OP%PsXENN33 X0]5>>22 ]. :%VP#)Z !"$%pf7BII 7tYryy YzBII 299 Bbii  		 ryy :(
 (
X $")) 0!RYY ! * * *, d
! d
 d
N D
' D
 D
PBII * L
#5 L
L
^ d
- d
 d
N F
 2 F
 F
R J
0 J
 J
Z	r2   