
    fTh                     t   S r SSKrSSKJr  SSKJrJrJr  SSKrSSK	rSSKJ
r
  SSKJrJrJr  SSKJr  SS	KJrJrJrJrJrJr  SS
KJr  SSKJrJrJr  SSKJrJ r J!r!J"r"  SSK#J$r$  \"RJ                  " \&5      r'Sq(S r)S r*S r+S r, " S S\RZ                  R\                  5      r/ " S S\RZ                  R\                  5      r0 " S S\
Rb                  5      r2 " S S\
Rb                  5      r3 " S S\
Rb                  5      r4 " S S\
Rb                  5      r5 " S S \
Rb                  5      r6 " S! S"\
Rb                  5      r7 " S# S$\
Rb                  5      r8 " S% S&\
Rb                  5      r9 " S' S(\
Rb                  5      r: " S) S*\
Rb                  5      r; " S+ S,\
Rb                  5      r<\ " S- S.\5      5       r=\ " S/ S0\=5      5       r>\ " S1 S2\=5      5       r? " S3 S4\
Rb                  5      r@\" S5S69 " S7 S8\=5      5       rA\ " S9 S:\=5      5       rB\ " S; S<\=5      5       rC\ " S= S>\=5      5       rD/ S?QrEg)@zPyTorch YOSO model.    N)Path)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_ninja_availableis_torch_cuda_availablelogging   )
YosoConfigc                  B    SSK Jn   S nU" / SQ5      nU " SUSS9  SS Kqg )Nr   )loadc                     [        [        5      R                  5       R                  R                  R                  S-  S-  nU  Vs/ s H  o!U-  PM	     sn$ s  snf )Nkernelsyoso)r   __file__resolveparent)files
src_folderfiles      ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/yoso/modeling_yoso.pyappend_root&load_cuda_kernels.<locals>.append_root8   sL    (^++-44;;BBYNQWW
.34edT!e444s   A)zfast_lsh_cumulation_torch.cppzfast_lsh_cumulation.cuzfast_lsh_cumulation_cuda.cufast_lsh_cumulationT)verbose)torch.utils.cpp_extensionr   r+   lsh_cumulation)r   r)   	src_filess      r(   load_cuda_kernelsr0   4   s'    .5 vwI		480    c                     [        U [        5      (       aC  / nU  H9  nUR                  5       (       d  UR                  5       nUR	                  U5        M;     U$ U R                  5       (       d  U R                  5       n U $ N)
isinstancelistis_contiguous
contiguousappendinput_tensorsouttensors      r(   to_contiguousr=   C   sq    -&&#F''))**,JJv $ 
**,,)446Mr1   c           	          [        U [        5      (       a;  / nU  H1  nUR                  [        R                  R                  USSS95        M3     U$ [        R                  R                  U SSS9$ )N   )pdim)r4   r5   r8   r   
functional	normalizer9   s      r(   rD   rD   Q   sa    -&&#FJJr}}..v.CD $
}}&&}r&BBr1   c                 b   [        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      e[        R                  " U R                  S5      U R                  S5      X#-  U R
                  S9nS[        R                  " X0R
                  S9-  n[        R                  " X5      R                  U R                  S5      U R                  S5      X#5      n[        R                  " X5      R                  UR                  S5      UR                  S5      X#5      nUS:  R                  5       nUS:  R                  5       n	[        R                  " X-  SS	9n
[        R                  " X-  SS	9n
U
R                  5       U
R                  5       4$ )
Nr   zQuery has incorrect size.zKey has incorrect size.r   r?   devicer   r@   rB   )lensize
ValueErrortorchrandnrG   arangematmulreshapeintsum)querykeynum_hashhash_lenrmat	raise_powquery_projectionkey_projectionquery_binary
key_binary
query_hashs              r(   hashingr^   [   sF   
5::<A455
388:!233;;uzz!}ejjmX5HQVQ]Q]^DU\\(<<@@I||E088A

STW_j\\#,44SXXa[#((1+xbN$q(--/L 1$))+J<3<J:1r:J>>Z^^---r1   c                   4    \ rS rSr\S 5       r\S 5       rSrg)YosoCumulationn   c           
      F   US   nS[         R                  " [         R                  " X4R                  SS5      5      5      [        R
                  -  -
  U-  nXS S 2S S 2S 4   -  US S 2S S S 24   -  n[         R                  " X5      n	U R                  XXXE5        X`l        U	$ )Nhash_code_lenr   r@   )rL   acosrO   	transposemathpisave_for_backwardconfig)
ctx
query_maskkey_maskrS   rT   valuerj   rc   expectationcumulation_values
             r(   forwardYosoCumulation.forwardo   s    /5::ell5--B:O&PQTXT[T[[[`mm!q!Tz$::Xaqj=QQ <<;jKS
r1   c                    [        U5      nU R                  u  p#pEpgU R                  nUS   n	[        R                  " XR                  SS5      5      U-  n
[        R                  " XS-  U-  5      n[        R                  " U
R                  SS5      U	S-  U-  5      n[        R                  " UR                  SS5      U5      nS S XUS 4$ )Nrc   r@   rd   r?   )r=   saved_tensorsrj   rL   rO   rf   )rk   gradrl   rm   ro   rS   rT   rn   rj   rc   weighted_exp
grad_querygrad_key
grad_values                 r(   backwardYosoCumulation.backward|   s    T"?B?P?P<
k#/||D//"b*AB[P\\,1Bc0IJ
<< 6 6r2 >QRARV[@[\\\+"7"7B"?F
T:TAAr1    N__name__
__module____qualname____firstlineno__staticmethodrq   rz   __static_attributes__r|   r1   r(   r`   r`   n   s*    
  
  B Br1   r`   c                   4    \ rS rSr\S 5       r\S 5       rSrg)YosoLSHCumulation   c           
      t   UR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S	5      e[        XX4U/5      u  pp4nUR                  nUS
   nUS   n	[	        SU	-  5      n
US   (       a  [
        R                  XX$XUS5      u  pO[        X4X5      u  p[
        R                  XX,XZUS5      nU R                  XXX4U5        X`l	        U$ )Nr   z6Query mask and Key mask differ in sizes in dimension 0z3Query mask and Query differ in sizes in dimension 0z1Query mask and Key differ in sizes in dimension 0z8Query mask and Value mask differ in sizes in dimension 0r   z,Key and Value differ in sizes in dimension 1r?   z,Query and Key differ in sizes in dimension 2rU   rc   use_fast_hash)
rJ   rK   r=   is_cudarQ   r.   	fast_hashr^   ri   rj   )rk   rl   rm   rS   rT   rn   rj   use_cudarU   rc   hashtable_capacityquery_hash_codekey_hash_coderp   s                 r(   rq   YosoLSHCumulation.forward   s   ??1q!11UVV??1A.RSS??1!,PQQ??1A.WXX88A;%**Q-'KLL::a=CHHQK'KLL2?W\ch@i2j/
e%%%*%/ M!12/"-;-E-E8(8UV.*O] .5U-Y*O)88%]egh
 	jOTY`ef
r1   c                    [        U5      nU R                  u  p#pEpgnU R                  n	UR                  n
U	S   n[	        SU-  5      nU	S   (       ac  [
        R                  X5X$XU
S5      n[
        R                  UUUUUUUS-  U-  UU
S5
      n[
        R                  UUUUUUUS-  U-  UU
S5
      nGOS[        R                  " [        R                  " XgR                  SS5      5      5      [        R                  -  -
  U-  nUUS S 2S S 2S 4   -  US S 2S S S 24   -  n[        R                  " XR                  SS5      5      U-  n[        R                  " UUS-  U-  5      n[        R                  " UR                  SS5      US-  U-  5      n[        R                  " UR                  SS5      U5      nS S XUS 4$ )Nrc   r?   lsh_backwardr      r@   rd   )r=   rt   rj   r   rQ   r.   lsh_weighted_cumulationrL   re   rO   rf   rg   rh   )rk   ru   rl   rm   r   r   rS   rT   rn   rj   r   rc   r   ry   rw   rx   ro   rv   s                     r(   rz   YosoLSHCumulation.backward   s   T"RURcRcO
oe%<</ M!12.!'66d`hjkJ (??"c)"J &=="e+"H uzz%,,ummBPR>S*TUX\X_X___dqqK%
1a:(>>!TST*AUUK <<oob".EFTLl]Q5F#4MNJ||L$:$:2r$B]UVEVZ_D_`Hk&;&;B&CTJJT:TAAr1   r|   Nr}   r|   r1   r(   r   r      s+    #  # J .B .Br1   r   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )YosoEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.c           	      d  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  S-   UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      S-   SS9  [+        USS	5      U l        U R#                  S
[$        R.                  " U R0                  R3                  5       [$        R4                  U R0                  R6                  S9SS9  g )N)padding_idxr?   epsposition_ids)r   r@   F)
persistentposition_embedding_typeabsolutetoken_type_idsdtyperG   )super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrL   rN   expandgetattrr   zerosr   rJ   longrG   selfrj   	__class__s     r(   r   YosoEmbeddings.__init__   sL   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWX[\\in 	 	
 (/v7PR\']$KK))..0

4K\K\KcKcd 	 	
r1   c                 `   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  S:X  a  U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr@   r   r   r   r   r   )rJ   r   hasattrr   r   rL   r   r   rG   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r(   rq   YosoEmbeddings.forward   s:    #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r1   )r   r   r   r   r   r   )NNNN	r~   r   r   r   __doc__r   rq   r   __classcell__r   s   @r(   r   r      s    Q
,   r1   r   c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )YosoSelfAttentioni#  c           	        > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      e[        S Ln[        5       (       a!  [        5       (       a  U(       d   [        5         UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [         R"                  " UR                  U R                  5      U l        [         R"                  " UR                  U R                  5      U l        [         R"                  " UR                  U R                  5      U l        [         R*                  " UR,                  5      U l        Ub  UOUR0                  U l        UR2                  U l        UR4                  U l        UR6                  S LU l        UR:                  U l        UR<                  U l        UR>                  U l        U R4                  U R:                  U R<                  U R>                  S.U l         UR6                  bX  [         RB                  " UR                  UR                  UR6                  S4UR6                  S	-  S4S
UR                  S9U l"        g g ! [         a#  n[        R                  SU 35         S nAGNKS nAff = f)Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: )rc   r   rU   r   r   r?   F)in_channelsout_channelskernel_sizepaddingbiasgroups)#r   r   r   num_attention_headsr   rK   r.   r   r   r0   	ExceptionloggerwarningrQ   attention_head_sizeall_head_sizer   LinearrS   rT   rn   r   attention_probs_dropout_probr   r   use_expectationrc   conv_windowuse_convr   rU   r   
lsh_configConv2dconv)r   rj   r   kernel_loadeder   s        r(   r   YosoSelfAttention.__init__$  sx    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  'd2"$$);)=)=mn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'>'J#PVPnPn 	$  &55#11**$6#11"// "//!// --	
 )		"66#77#//3++q0!411DI *=  n!hijhklmmns   
J? ?
K,	K''K,c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nr@   r   r?   r   r   )rJ   r   r   viewpermute)r   layernew_layer_shapes      r(   transpose_for_scores&YosoSelfAttention.transpose_for_scoresW  sM    **,s+t/G/GIaIa.bb

O,}}Q1a((r1   c                 @   U R                  U5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      nU R                  (       a  U R                  XbS S 2S S S 2S 4   -  5      nUR                  5       u  ppUR                  X-  X5      nUR                  X-  X5      nUR                  X-  X5      nSUS-  -   nUR                  S5      R                  U
SS9R                  X-  U5      R                  5       nSnU R                  (       d  X:  a  X-  XU-
  4n[        R                  " U[        R                  " XR                  S9/SS9n[        R                  " U[        R                  " XR                  S9/SS9n[        R                  " U[        R                  " XR                  S9/SS9nU R                  (       d  U R                   (       a  [#        Xu/5      u  puU R                  (       a"  [$        R'                  X"XuX`R(                  5      nO![*        R'                  X"XuX`R(                  5      nU R                  (       d  X:  a  US S 2S S 2S U24   n[#        U5      nUR                  XX5      nU R                  (       a  UW-  nUR-                  SS	SS
5      R/                  5       nUR                  5       S S U R0                  4-   nUR2                  " U6 nU(       a  X4nU$ U4nU$ )N      ?g     @r   rH       rF   r@   r   r?   r   rd   )rS   r   rT   rn   r   r   rJ   rP   	unsqueezerepeat_interleaverQ   r   rL   catr   rG   trainingrD   r`   applyr   r   r   r7   r   r   )r   hidden_statesattention_maskoutput_attentionsmixed_query_layer	key_layervalue_layerquery_layerconv_value_layer
batch_size	num_headsseq_lenhead_dimgpu_warp_sizepad_sizecontext_layernew_context_layer_shapeoutputss                     r(   rq   YosoSelfAttention.forward\  s    JJ}5--dhh}.EF	//

=0IJ//0AB==#yyaqRVFV7W)WX3>3C3C3E0
w!))**@'T%%j&<gP	!))**@'T~77$$Q'ya0WZ+W5SU	 	 $$(*B!-w8PPH))KK1C1CD K 		KK1A1AB I  ))KK1C1CD K 4==%./G%H"K*00UdUdM .33UdUdM $$(*B)!Q		/:M!-0%--jWW==--M%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD4E=0 MZK[r1   )r   r   r   r   rc   rT   r   r   r   rU   r   rS   r   r   r   rn   r3   NF)	r~   r   r   r   r   r   rq   r   r   r   s   @r(   r   r   #  s    1f)
Q Qr1   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )YosoSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr   )r   r   r   r   r   denser   r   r   r   r   r   s     r(   r   YosoSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r1   r   input_tensorreturnc                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r3   r  r   r   r   r   r  s      r(   rq   YosoSelfOutput.forward  5    

=1]3}'CDr1   r   r  r   
r~   r   r   r   r   rL   Tensorrq   r   r   r   s   @r(   r	  r	    6    >U\\  RWR^R^  r1   r	  c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )YosoAttentioni  c                 |   > [         TU ]  5         [        XS9U l        [	        U5      U l        [        5       U l        g )N)r   )r   r   r   r   r	  outputsetpruned_heads)r   rj   r   r   s      r(   r   YosoAttention.__init__  s0    %f^	$V,Er1   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   rH   )rI   r   r   r   r   r  r   rS   rT   rn   r  r  r   union)r   headsindexs      r(   prune_headsYosoAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r1   c                 f    U R                  XU5      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   )r   r  )r   r   r   r   self_outputsattention_outputr  s          r(   rq   YosoAttention.forward  sA    yy@QR;;|AF#%QR(88r1   )r  r  r   r3   r  )	r~   r   r   r   r   r$  rq   r   r   r   s   @r(   r  r    s    ";$ r1   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )YosoIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r3   )r   r   r   r   r   intermediate_sizer  r4   
hidden_actstrr   intermediate_act_fnr   s     r(   r   YosoIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r1   r   r  c                 J    U R                  U5      nU R                  U5      nU$ r3   r  r0  r   r   s     r(   rq   YosoIntermediate.forward  s&    

=100?r1   r3  r  r   s   @r(   r+  r+    s(    9U\\ ell  r1   r+  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )
YosoOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r   r   r   r   r-  r   r  r   r   r   r   r   r   s     r(   r   YosoOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r1   r   r  r  c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r3   r  r  s      r(   rq   YosoOutput.forward  r  r1   r  r  r   s   @r(   r7  r7    r  r1   r7  c                   8   ^  \ rS rSrU 4S jrSS jrS rSrU =r$ )	YosoLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        [        U5      U l        [        U5      U l
        g Nr   )r   r   chunk_size_feed_forwardseq_len_dimr  	attentionadd_cross_attentionr+  intermediater7  r  r   s     r(   r   YosoLayer.__init__  sW    '-'E'E$&v.#)#=#= ,V4 (r1   c                     U R                  XUS9nUS   nUSS  n[        U R                  U R                  U R                  U5      nU4U-   nU$ )N)r   r   r   )rB  r   feed_forward_chunkr@  rA  )r   r   r   r   self_attention_outputsr(  r  layer_outputs           r(   rq   YosoLayer.forward  sf    !%ar!s1!4(,0##T%A%A4CSCSUe
  /G+r1   c                 J    U R                  U5      nU R                  X!5      nU$ r3   )rD  r  )r   r(  intermediate_outputrI  s       r(   rG  YosoLayer.feed_forward_chunk  s)    "//0@A{{#6Ir1   )rC  rB  r@  rD  r  rA  r  )	r~   r   r   r   r   rq   rG  r   r   r   s   @r(   r=  r=    s    ) r1   r=  c                   <   ^  \ rS rSrU 4S jr     SS jrSrU =r$ )YosoEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r  )
r   r   rj   r   
ModuleListrangenum_hidden_layersr=  r   gradient_checkpointing)r   rj   _r   s      r(   r   YosoEncoder.__init__  sR    ]]uVE]E]?^#_?^!If$5?^#_`
&+# $`s   A&c                    U(       a  SOS nU(       a  SOS n[        U R                  5       Hq  u  pU(       a  Xq4-   nU R                  (       a0  U R                  (       a  U R	                  U
R
                  UUU5      nO	U
" XU5      nUS   nU(       d  Mi  XS   4-   nMs     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nr|   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr3   r|   ).0vs     r(   	<genexpr>&YosoEncoder.forward.<locals>.<genexpr>D  s     m$[q$[s   	)last_hidden_stater   
attentions)	enumerater   rT  r   _gradient_checkpointing_func__call__tupler   )r   r   r   	head_maskr   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsilayer_modulelayer_outputss               r(   rq   YosoEncoder.forward"  s     #7BD$5b4(4OA#$58H$H!**t}} $ A A ))!"%	! !-]L] ^)!,M  &91=M<O&O#!  5$   14D Dm]GZ$[mmm1++*
 	
r1   )rj   rT  r   )NNFFT)r~   r   r   r   r   rq   r   r   r   s   @r(   rO  rO    s#    , "'
 '
r1   rO  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )YosoPredictionHeadTransformiM  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r  )r   r   r   r   r   r  r4   r.  r/  r   transform_act_fnr   r   r   s     r(   r   $YosoPredictionHeadTransform.__init__N  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr1   r   r  c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r3   )r  ro  r   r4  s     r(   rq   #YosoPredictionHeadTransform.forwardW  s4    

=1--m<}5r1   )r   r  ro  r  r   s   @r(   rm  rm  M  s)    UU\\ ell  r1   rm  c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )YosoLMPredictionHeadi_  c                 H  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )NF)r   )r   r   rm  	transformr   r   r   r   decoder	ParameterrL   r   r   r   s     r(   r   YosoLMPredictionHead.__init__`  sm    4V< yy!3!3V5F5FUSLLV->->!?@	 !IIr1   c                 :    U R                   U R                  l         g r3   )r   rw  r   s    r(   _tie_weights!YosoLMPredictionHead._tie_weightsm  s     IIr1   c                 J    U R                  U5      nU R                  U5      nU$ r3   )rv  rw  r4  s     r(   rq   YosoLMPredictionHead.forwardp  s$    }5]3r1   )r   rw  rv  )	r~   r   r   r   r   r|  rq   r   r   r   s   @r(   rt  rt  _  s    && r1   rt  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )YosoOnlyMLMHeadiw  c                 B   > [         TU ]  5         [        U5      U l        g r3   )r   r   rt  predictionsr   s     r(   r   YosoOnlyMLMHead.__init__x  s    /7r1   sequence_outputr  c                 (    U R                  U5      nU$ r3   r  )r   r  prediction_scoress      r(   rq   YosoOnlyMLMHead.forward|  s     ,,_=  r1   r  r  r   s   @r(   r  r  w  s(    8!u|| ! ! !r1   r  c                   &    \ rS rSr\rSrSrS rSr	g)YosoPreTrainedModeli  r!   Tc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weightsg        )meanstdNr   )r4   r   r   weightdatanormal_rj   initializer_ranger   zero_r   r   r   fill_)r   modules     r(   _init_weights!YosoPreTrainedModel._init_weights  s   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r1   r|   N)
r~   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointingr  r   r|   r1   r(   r  r    s    L&*#*r1   r  c                   D  ^  \ rS rSrU 4S jrS rS rS r\         SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )	YosoModeli  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r3   )r   r   rj   r   r   rO  encoder	post_initr   s     r(   r   YosoModel.__init__  s9     (0"6* 	r1   c                 .    U R                   R                  $ r3   r   r   r{  s    r(   get_input_embeddingsYosoModel.get_input_embeddings  s    ...r1   c                 $    XR                   l        g r3   r  )r   rn   s     r(   set_input_embeddingsYosoModel.set_input_embeddings  s    */'r1   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r   rB  r$  )r   heads_to_pruner   r"  s       r(   _prune_headsYosoModel._prune_heads  s<    
 +002LELLu%//;;EB 3r1   r   r   r   r   rc  r   r   rd  re  r  c
           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eU
u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U24   nUR                  X5      nUnO$[        R                  " U
[        R                  US9nU R!                  XPR                   R"                  5      nU R                  UUUUS9nU R%                  UUUUUU	S9nUS	   nU	(       d	  U4US
S  -   $ ['        UUR(                  UR*                  UR,                  S9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer@   z5You have to specify either input_ids or inputs_embedsrF   r   r   )r   r   r   r   )r   rc  r   rd  re  r   r   )r]  r   r^  cross_attentions)rj   r   rd  use_return_dictrK   %warn_if_padding_and_no_attention_maskrJ   rG   rL   onesr   r   r   r   r   r   get_head_maskrS  r  r   r   r^  r  )r   r   r   r   r   rc  r   r   rd  re  r   r   r   rG   r   r   embedding_outputencoder_outputsr  s                      r(   rq   YosoModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z &&y++2O2OP	??%)'	 + 
 ,,)/!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r1   )rj   r   r  )	NNNNNNNNN)r~   r   r   r   r   r  r  r  r   r   rL   r  boolr   r   r   rq   r   r   r   s   @r(   r  r    s   /0C  -11515/3,004,0/3&*I
ELL)I
 !.I
 !.	I

 u||,I
 ELL)I
  -I
 $D>I
 'tnI
 d^I
 
u88	9I
 I
r1   r  c                   f  ^  \ rS rSrSS/rU 4S jrS rS r\          SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )YosoForMaskedLMi  zcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r3   )r   r   r  r!   r  clsr  r   s     r(   r   YosoForMaskedLM.__init__  s4     f%	"6* 	r1   c                 B    U R                   R                  R                  $ r3   )r  r  rw  r{  s    r(   get_output_embeddings%YosoForMaskedLM.get_output_embeddings  s    xx##+++r1   c                     XR                   R                  l        UR                  U R                   R                  l        g r3   )r  r  rw  r   )r   new_embeddingss     r(   set_output_embeddings%YosoForMaskedLM.set_output_embeddings  s*    '5$$2$7$7!r1   r   r   r   r   rc  r   labelsr   rd  re  r  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a{  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Nr   r   r   rc  r   r   rd  re  r   r@   r   losslogitsr   r^  )
rj   r  r!   r  r	   r   r   r   r   r^  )r   r   r   r   r   rc  r   r  r   rd  re  r  r  r  masked_lm_lossloss_fctr  s                    r(   rq   YosoForMaskedLM.forward  s    ( &1%<k$++B]B]))))%'/!5#  

 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r1   )r  r!   
NNNNNNNNNN)r~   r   r   r   _tied_weights_keysr   r  r  r   r   rL   r  r  r   r   r   rq   r   r   r   s   @r(   r  r    s   :<Z[,8  -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
un$	%2
 2
r1   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )YosoClassificationHeadiI  z-Head for sentence-level classification tasks.c                 8  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        Xl        g r3   )r   r   r   r   r   r  r   r   r   
num_labelsout_projrj   r   s     r(   r   YosoClassificationHead.__init__L  se    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr1   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        U R                  R                     " U5      nU R                  U5      nU R                  U5      nU$ )Nr   )r   r  r   rj   r.  r  )r   featureskwargsxs       r(   rq   YosoClassificationHead.forwardT  se    Q1WLLOJJqM4;;))*1-LLOMM!r1   )rj   r  r   r  r   r   s   @r(   r  r  I  s    7 r1   r  z
    YOSO Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )YosoForSequenceClassificationi^  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r3   )r   r   r  r  r!   r  
classifierr  r   s     r(   r   &YosoForSequenceClassification.__init__e  sA      ++f%	08 	r1   r   r   r   r   rc  r   r  r   rd  re  r  c                 0   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr@   r  )rj   r  r!   r  problem_typer  r   rL   r   rQ   r
   squeezer	   r   r   r   r   r^  )r   r   r   r   r   rc  r   r  r   rd  re  r  r  r  r  r  r  s                    r(   rq   %YosoForSequenceClassification.forwardn  s   ( &1%<k$++B]B]))))%'/!5#  

 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r1   )r  r  r!   r  )r~   r   r   r   r   r   r   rL   r  r  r   r   r   rq   r   r   r   s   @r(   r  r  ^  s     -11515/3,004)-,0/3&*C
ELL)C
 !.C
 !.	C

 u||,C
 ELL)C
  -C
 &C
 $D>C
 'tnC
 d^C
 
u..	/C
 C
r1   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )YosoForMultipleChoicei  c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  S5      U l        U R                  5         g r?  )
r   r   r  r!   r   r   r   pre_classifierr  r  r   s     r(   r   YosoForMultipleChoice.__init__  s_     f%	 ii(:(:F<N<NO))F$6$6: 	r1   r   r   r   r   rc  r   r  r   rd  re  r  c                    U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nUSS2S4   nU R                  U5      n[        R                  " 5       " U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r@   rd   r  r   r  )rj   r  shaper   rJ   r!   r  r   ReLUr  r	   r   r   r^  )r   r   r   r   r   rc  r   r  r   rd  re  num_choicesr  hidden_statepooled_outputr  reshaped_logitsr  r  r  s                       r(   rq   YosoForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ))))%'/!5#  

 qz$QT*++M:	-0/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r1   )r  r  r!   r  )r~   r   r   r   r   r   r   rL   r  r  r   r   r   rq   r   r   r   s   @r(   r  r    s     -11515/3,004)-,0/3&*Z
ELL)Z
 !.Z
 !.	Z

 u||,Z
 ELL)Z
  -Z
 &Z
 $D>Z
 'tnZ
 d^Z
 
u//	0Z
 Z
r1   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )YosoForTokenClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r3   )r   r   r  r  r!   r   r   r   r   r   r   r  r  r   s     r(   r   #YosoForTokenClassification.__init__!  si      ++f%	zz&"<"<=))F$6$68I8IJ 	r1   r   r   r   r   rc  r   r  r   rd  re  r  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUb  [        5       nUb  UR                  S5      S:H  nUR                  SU R                  5      n[        R                  " UUR                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r@   r   r  )rj   r  r!   r   r  r	   r   r  rL   wherer<   ignore_indextype_asr   r   r^  )r   r   r   r   r   rc  r   r  r   rd  re  r  r  r  r  r  active_lossactive_logitsactive_labelsr  s                       r(   rq   "YosoForTokenClassification.forward,  sk   $ &1%<k$++B]B]))))%'/!5#  

 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r1   )r  r   r  r!   r  )r~   r   r   r   r   r   r   rL   r  r  r   r   r   rq   r   r   r   s   @r(   r  r    s    	  -11515/3,004)-,0/3&*;
ELL);
 !.;
 !.	;

 u||,;
 ELL);
  -;
 &;
 $D>;
 'tn;
 d^;
 
u++	,;
 ;
r1   r  c                   r  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\	   S\\	   S\\	   S\
\\4   4S jj5       rSrU =r$ )YosoForQuestionAnsweringik  c                    > [         TU ]  U5        SUl        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g )Nr?   )
r   r   r  r  r!   r   r   r   
qa_outputsr  r   s     r(   r   !YosoForQuestionAnswering.__init__m  s[      ++f%	))F$6$68I8IJ 	r1   r   r   r   r   rc  r   start_positionsend_positionsr   rd  re  r  c                    Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   r@   rH   )r  r?   )r  start_logits
end_logitsr   r^  )rj   r  r!   r  splitr  rI   rJ   clampr	   r   r   r^  )r   r   r   r   r   rc  r   r  r  r   rd  re  r  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r(   rq    YosoForQuestionAnswering.forwardy  s    &1%<k$++B]B]))))%'/!5#  

 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r1   )r  r  r!   )NNNNNNNNNNN)r~   r   r   r   r   r   r   rL   r  r  r   r   r   rq   r   r   r   s   @r(   r   r   k  s   
  -11515/3,0042604,0/3&*>
ELL)>
 !.>
 !.	>

 u||,>
 ELL)>
  ->
 "%,,/>
  ->
 $D>>
 'tn>
 d^>
 
u22	3>
 >
r1   r   )r  r  r   r  r  r=  r  r  )Fr   rg   pathlibr   typingr   r   r   rL   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_yosor   
get_loggerr~   r   r.   r0   r=   rD   r^   autogradFunctionr`   r   Moduler   r   r	  r  r+  r7  r=  rO  rm  rt  r  r  r  r  r  r  r  r  r   __all__r|   r1   r(   <module>r     s_      ) )    A A !  . l l  + 
		H	% 1C.&BU^^,, B>VB// VBt9RYY 9xJ		 J\RYY BII Bryy   		 :.
")) .
d")) $299 0!bii ! */ * *, c
# c
 c
L F
) F
 F
RRYY * N
$7 N
N
b f
/ f
 f
R H
!4 H
 H
V L
2 L
 L
^	r1   