
    fThW                       S r SSKrSSKrSSKJr  SSKJrJrJrJ	r	J
r
  SSKrSSKrSSKJr  SSKJrJrJr  SSKJrJr  SS	KJr  SS
KJrJrJrJrJrJrJrJr  SSK J!r!  SSK"J#r#J$r$J%r%  SSK&J'r'J(r(J)r)  SSK*J+r+  \)RX                  " \-5      r.SES jr/ " S S\R`                  5      r1 " S S\R`                  5      r2 " S S\R`                  5      r3S\20r4 " S S\R`                  5      r5 " S S\R`                  5      r6 " S S\R`                  5      r7 " S S\R`                  5      r8 " S  S!\R`                  5      r9 " S" S#\R`                  5      r: " S$ S%\R`                  5      r;\( " S& S'\!5      5       r<\ " S( S)\'5      5       r=\( " S* S+\<5      5       r> " S, S-\R`                  5      r? " S. S/\R`                  5      r@\(" S0S19 " S2 S3\<5      5       rA\(" S4S19 " S5 S6\<5      5       rB\(" S7S19 " S8 S9\<5      5       rC\(" S:S19 " S; S<\<5      5       rD\( " S= S>\<5      5       rE\( " S? S@\<5      5       rF\(" SAS19 " SB SC\<\5      5       rG/ SDQrHg)FzPyTorch ELECTRA model.    N)	dataclass)CallableListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)GenerationMixin)"BaseModelOutputWithCrossAttentions)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )ElectraConfigc                 >    SSK nSSKnSSKn[        R                  R                  U5      n[        R                  SU 35        UR                  R                  U5      n/ n	/ n
U H]  u  p[        R                  SU SU 35        UR                  R                  X{5      nU	R                  U5        U
R                  U5        M_     [        X5       GH;  u  pUn [        U [         5      (       a  UR#                  SS5      nUS	:X  a$  UR#                  S
S5      nUR#                  SS
5      nUR#                  SS5      nUR#                  SS5      nUR%                  S5      n['        S U 5       5      (       a  [        R                  SU 35        M  U nU H  nUR)                  SU5      (       a  UR%                  SU5      nOU/nUS   S:X  d	  US   S:X  a  [+        US5      nOZUS   S:X  d	  US   S:X  a  [+        US5      nO;US   S:X  a  [+        US5      nO%US   S:X  a  [+        US5      nO[+        UUS   5      n[-        U5      S:  d  M  [/        US    5      nUU   nM     WR1                  S!5      (       a  [+        US5      nOUS:X  a  UR3                  U5      n UR4                  UR4                  :w  a&  [7        S"UR4                   S#UR4                   S$35      e [;        S%U 3U5        [<        R>                  " U5      Ul         GM>     U $ ! [         a    [        R                  S5        e f = f! [6         a1  nU=R8                  UR4                  UR4                  4-  sl        e SnAff = f! [B         a  n[;        SU 3UU5         SnAGM  SnAff = f)&z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape zelectra/embeddings/zgenerator/embeddings/	generatorzelectra/zdiscriminator/z
generator/dense_1dense_predictionz!generator_predictions/output_biaszgenerator_lm_head/bias/c              3   *   #    U  H	  oS ;   v   M     g7f))global_steptemperatureN ).0ns     d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/electra/modeling_electra.py	<genexpr>-load_tf_weights_in_electra.<locals>.<genexpr>^   s     E166s   z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r    _embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )"renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzip
isinstanceElectraForMaskedLMreplacesplitany	fullmatchgetattrlenintendswith	transposeshape
ValueErrorargsprinttorch
from_numpydataAttributeError)modelconfigtf_checkpoint_pathdiscriminator_or_generatorr;   nptftf_path	init_varsnamesarraysnamerU   arrayoriginal_namepointerm_namescope_namesnumes                       r-   load_tf_weights_in_electraro   4   so   
 ggoo01G
KK8	BC''0IEF (l5'BC&&w5Te	 !
 5)!3	%!344||$9;RS)[8||J0@A||L*=<<	+=>D<< CE]^D::c?D EEEEi78G<< 0&99"$((9f"=K#)(Kq>X-Q71J%gx8G ^}4A&8P%gv6G ^'77%gx8G ^w.%g|<G%g{1~>G{#q(k!n-C%clG# $ }--!'848#U+==EKK/$~gmm_DUV[VaVaUbbm%noo 0
 .tf5}E ++E2GLg *n LQ  Q	
 	@  7==%++66
  	Im_-tQ7	sW   L B5M6B:M6AM6"A L8"+M6!L58
M3,M..M33M66
N NNc                      ^  \ rS rSrSrU 4S jr     SS\\R                     S\\R                     S\\R                     S\\R                     S\
S	\R                  4S
 jjrSrU =r$ )ElectraEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.c                 .  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  [+        USS5      U l        U R#                  S	[$        R.                  " U R0                  R3                  5       [$        R4                  S
9SS9  g )N)padding_idxepsposition_ids)r    F)
persistentposition_embedding_typeabsolutetoken_type_idsdtype)super__init__r	   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrY   arangeexpandrP   rz   zerosrw   sizelongselfr^   	__class__s     r-   r   ElectraEmbeddings.__init__   s1   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`" f&;&;AVAVWzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsr|   rw   inputs_embedspast_key_values_lengthreturnc                 d   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XWU-   24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      n	U	nO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R                  U5      nU R                  U5      nU$ )Nrx   r    r|   r   r~   devicer{   )r   rw   hasattrr|   r   rY   r   r   r   r   r   rz   r   r   r   )r   r   r|   rw   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s                r-   forwardElectraEmbeddings.forward   sC     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r   )r   r   rz   r   r   r   )NNNNr   )__name__
__module____qualname____firstlineno____doc__r   r   rY   
LongTensorFloatTensorrR   Tensorr   __static_attributes____classcell__r   s   @r-   rq   rq      s    Q
. 15593759&''E,,-' !!1!12' u//0	'
   1 12' !$' 
' 'r   rq   c                   b  ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr      SS\R                  S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
\
\R                           S\\   S\
\R                     4S jjrSrU =r$ )ElectraSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        g )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads ()rz   r{   relative_keyrelative_key_queryr9   r    )r   r   hidden_sizenum_attention_headsr   rV   rR   attention_head_sizeall_head_sizer	   Linearquerykeyvaluer   attention_probs_dropout_probr   rP   rz   r   r   distance_embedding
is_decoderr   r^   rz   r   s      r-   r   ElectraSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r   xr   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nrx   r   r9   r    r   )r   r   r   viewpermute)r   r   new_x_shapes      r-   transpose_for_scores)ElectraSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 V   U R                  U5      nUS Ln	U	(       a  Ub  US   n
US   nUnGOU	(       aC  U R                  U R                  U5      5      n
U R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      n
U R                  U R                  U5      5      n[        R
                  " US   U
/SS9n
[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U5      nUS LnU R                  (       a  X4n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  UR                  S   U
R                  S   nnU(       aB  [        R                  " US-
  [        R                  UR                  S	9R                  SS5      nO>[        R                  " U[        R                  UR                  S	9R                  SS5      n[        R                  " U[        R                  UR                  S	9R                  SS5      nUU-
  nU R!                  UU R"                  -   S-
  5      nUR%                  UR&                  S
9nU R                  S:X  a  [        R(                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R(                  " SUU5      n[        R(                  " SU
U5      nUU-   U-   nU[*        R,                  " U R.                  5      -  nUb  X-   n[0        R2                  R5                  USS9nU R7                  U5      nUb  UU-  n[        R                  " UU5      nUR9                  SSSS5      R;                  5       nUR=                  5       S S U R>                  4-   nUR                  U5      nU(       a  UU4OU4nU R                  (       a  UU4-   nU$ )Nr   r    r9   dimrx   r   r   r   r}   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) r   r   r   r   rY   catr   matmulrT   rz   rU   tensorr   r   r   r   r   r   tor~   einsummathsqrtr   r	   
functionalsoftmaxr   r   
contiguousr   r   )r   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r-   r   ElectraSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr   )r   r   r   r   r   r   r   r   rz   r   r   NNNNNNF)r   r   r   r   r   rY   r   r   r   r   r   boolr   r   r   r   s   @r-   r   r      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	c cr   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ElectraSelfOutputiP  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nru   )r   r   r	   r   r   denser   r   r   r   r   r   s     r-   r   ElectraSelfOutput.__init__Q  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   r   r   r   r  s      r-   r   ElectraSelfOutput.forwardW  5    

=1]3}'CDr   r   r   r   
r   r   r   r   r   rY   r   r   r   r   r   s   @r-   r   r   P  6    >U\\  RWR^R^  r   r   eagerc                   .  ^  \ rS rSrSU 4S jjrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\\   S\
\R                     4S jjrSrU =r$ )ElectraAttentionid  c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )Nrz   )	r   r   ELECTRA_SELF_ATTENTION_CLASSES_attn_implementationr   r   outputsetpruned_headsr   s      r-   r   ElectraAttention.__init__e  s@    263N3NO
	 (/Er   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r    r   )rQ   r   r   r   r   r  r   r   r   r   r  r   r   union)r   headsindexs      r-   prune_headsElectraAttention.prune_headsm  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r   r   r   r   r   r   r   r   r   c           	      p    U R                  UUUUUUU5      nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r    )r   r  )r   r   r   r   r   r   r   r   self_outputsattention_outputr   s              r-   r   ElectraAttention.forward  sW     yy!"
  ;;|AF#%QR(88r   )r  r  r   r   r   )r   r   r   r   r   r  rY   r   r   r   r   r   r   r   r   r   s   @r-   r  r  d  s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	 r   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ElectraIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r   r   r	   r   r   intermediate_sizer   rJ   
hidden_actstrr   intermediate_act_fnr   s     r-   r   ElectraIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r&  )r   r   s     r-   r   ElectraIntermediate.forward  s&    

=100?r   r)  r	  r   s   @r-   r!  r!    s(    9U\\ ell  r   r!  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ElectraOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r   r   r	   r   r#  r   r   r   r   r   r   r   r   s     r-   r   ElectraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r   r   r  r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r  r  s      r-   r   ElectraOutput.forward  r  r   r  r	  r   s   @r-   r,  r,    r
  r   r,  c                   *  ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4S jjrS rSrU =r$ )ElectraLayeri  c                 t  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        g )Nr    z> should be used as a decoder model if cross attention is addedr{   r  )r   r   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionrV   crossattentionr!  intermediater,  r  r   s     r-   r   ElectraLayer.__init__  s    '-'E'E$)&1 ++#)#=#= ##?? D6)g!hii"26S]"^D/7#F+r   r   r   r   r   r   r   r   r   c           	         Ub  US S OS nU R                  UUUUUS9n	U	S   n
U R                  (       a  U	SS nU	S   nOU	SS  nS nU R                  (       aZ  UbW  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  U
UUUUUU5      nUS   n
XSS -   nUS   nWU-   n[        U R                  U R                  U R                  U
5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nr9   )r   r   r   r    rx   r8  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r6  r   r   rV   r8  r   feed_forward_chunkr4  r5  )r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr  r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r-   r   ElectraLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r9  r  )r   r  intermediate_outputrC  s       r-   r<  ElectraLayer.feed_forward_chunk  s)    "//0@A{{#6Ir   )r7  r6  r4  r8  r9  r   r  r5  r   )r   r   r   r   r   rY   r   r   r   r   r   r   r<  r   r   r   s   @r-   r2  r2    s    ," 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?B r   r2  c                   R  ^  \ rS rSrU 4S jr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\\
   S\\
   S\\
   S\\	\R                     \4   4S jjrSrU =r$ )ElectraEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r   r   r^   r	   
ModuleListrangenum_hidden_layersr2  layergradient_checkpointing)r   r^   _r   s      r-   r   ElectraEncoder.__init__  sR    ]]%H`H`Ba#bBaQL$8Ba#bc
&+# $cs   A&r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr   c                 8   U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  SOS n[        U R                  5       H  u  nnU	(       a  X4-   nUb  X?   OS nUb  Xo   OS nU R                  (       a4  U R                  (       a#  U R                  UR                  UUUUUUU5      nOU" UUUUUUU5      nUS   nU(       a	  UUS   4-  nU(       d  M  UUS   4-   nU R                   R                  (       d  M  UUS   4-   nM     U	(       a  X4-   nU
(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
Nr*   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   rx   r    r9   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r*   )r+   vs     r-   r.   )ElectraEncoder.forward.<locals>.<genexpr>W  s"      
A  s   	)last_hidden_staterR  r   
attentionscross_attentions)r^   r7  rO  trainingr?   warning_once	enumeraterN  _gradient_checkpointing_func__call__tupler   )r   r   r   r   r   r   rR  r   r   rS  rT  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       r-   r   ElectraEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4OA|#$58H$H!.7.CilO3B3N_/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::"  &9]1=M<O&O#;;222+?=QRCSBU+U(G  5J   14D D 
 "&%'(
 
 
 9+.+*1
 	
r   )r^   rO  rN  )	NNNNNNFFT)r   r   r   r   r   rY   r   r   r   r   r   r   r   r   r   r   r   s   @r-   rI  rI    s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
 S
r   rI  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ElectraDiscriminatorPredictionsik  zEPrediction module for the discriminator, made up of two dense layers.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  5      U l        [        R                  " UR                  S5      U l	        Xl
        g Nr    )r   r   r	   r   r   r   r   r$  
activationr%   r^   r   s     r-   r   (ElectraDiscriminatorPredictions.__init__n  s\    YYv1163E3EF
():):; "		&*<*<a @r   c                     U R                  U5      nU R                  U5      nU R                  U5      R                  S5      nU$ )Nrx   )r   ro  r%   squeeze)r   discriminator_hidden_statesr   logitss       r-   r   'ElectraDiscriminatorPredictions.forwardv  s?    

#>?6&&}5==bAr   )ro  r^   r   r%   	r   r   r   r   r   r   r   r   r   r   s   @r-   rl  rl  k  s    O r   rl  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ElectraGeneratorPredictionsi~  zAPrediction module for the generator, made up of two dense layers.c                    > [         TU ]  5         [        S5      U l        [        R
                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        g )Ngeluru   )r   r   r   ro  r	   r   r   r   r   r   r   r   s     r-   r   $ElectraGeneratorPredictions.__init__  sV    (0f&;&;AVAVWYYv1163H3HI
r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   ro  r   )r   generator_hidden_statesr   s      r-   r   #ElectraGeneratorPredictions.forward  s3    

#:;6}5r   )r   ro  r   rv  r   s   @r-   rx  rx  ~  s    KJ r   rx  c                   *    \ rS rSr\r\rSrSr	S r
Srg)ElectraPreTrainedModeli  electraTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weightsg        )meanstdNg      ?)rJ   r	   r   r2   r[   normal_r^   initializer_ranger5   zero_r   rt   r   fill_)r   modules     r-   _init_weights$ElectraPreTrainedModel._init_weights  s   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r   r*   N)r   r   r   r   r!   config_classro   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr  r   r*   r   r-   r  r    s     L0O!&*#*r   r  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	ElectraForPreTrainingOutputi  a  
Output type of [`ElectraForPreTraining`].

Args:
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss of the ELECTRA objective.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlossrt  r   rZ  r*   )r   r   r   r   r   r  r   rY   r   __annotations__rt  r   r   rZ  r   r*   r   r-   r  r    sg    * )-D(5$$
%,*.FHU&&'.8<M8E%"3"345<59Ju00129r   r  c                      ^  \ rS rSrU 4S jrS rS rS r\             SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\   S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rSrU =r$ )ElectraModeli  c                 $  > [         TU ]  U5        [        U5      U l        UR                  UR
                  :w  a0  [        R                  " UR                  UR
                  5      U l        [        U5      U l
        Xl        U R                  5         g r   )r   r   rq   r   r   r   r	   r   embeddings_projectrI  encoderr^   	post_initr   s     r-   r   ElectraModel.__init__  sj     +F3  F$6$66&(ii0E0EvGYGY&ZD#%f-r   c                 .    U R                   R                  $ r   r   r   r   s    r-   get_input_embeddings!ElectraModel.get_input_embeddings  s    ...r   c                 $    XR                   l        g r   r  )r   r   s     r-   set_input_embeddings!ElectraModel.set_input_embeddings  s    */'r   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  rN  r6  r  )r   heads_to_prunerN  r  s       r-   _prune_headsElectraModel._prune_heads  s<    
 +002LELLu%//;;EB 3r   r   r   r|   rw   r   r   r   r   rR  r   r   rS  rT  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[	        S5      eUu  nnUb  UR                  OUR                  nU	b  U	S   S   R                  S   OSnUc  [        R                  " UUS9nUcs  [        U R                  S5      (       a4  U R                  R                  S S 2S U24   nUR                  UU5      nUnO$[        R                  " U[        R                   US9nU R#                  X.5      nU R                   R$                  (       aE  UbB  UR                  5       u  nnnUU4nUc  [        R                  " UUS9nU R'                  U5      nOS nU R)                  XPR                   R*                  5      nU R                  UUUUUS	9n[        U S
5      (       a  U R-                  U5      nU R/                  UUUUUU	U
UUUS9
nU$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timerx   z5You have to specify either input_ids or inputs_embedsr   r9   )r   r|   r   )r   rw   r|   r   r   r  )	r   r   r   r   rR  r   r   rS  rT  )r^   r   rS  use_return_dictrV   %warn_if_padding_and_no_attention_maskr   r   rU   rY   onesr   r   r|   r   r   r   get_extended_attention_maskr   invert_attention_maskget_head_maskrM  r  r  )r   r   r   r|   rw   r   r   r   r   rR  r   r   rS  rT  r   
batch_sizer   r   r   r   r   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthrP  encoder_hidden_shapeencoder_extended_attention_maskr   s                               r-   r   ElectraModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZFCN!t(899*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z"&"B"B>"_ ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+&&y++2O2OP	%)'#9 ( 
 4-.. 33MBM2"7#B+/!5# % 
 r   )r^   r   r  r  )NNNNNNNNNNNNN)r   r   r   r   r   r  r  r  r   r   rY   r   r   r   r   r   r   r   r   r   r   r   s   @r-   r  r    sk   
/0C  -11515/3,0048<9==A$(,0/3&*WELL)W !.W !.	W
 u||,W ELL)W  -W  (5W !) 6W "$u'8'8"9:W D>W $D>W 'tnW d^W 
uU\\"$FF	GW Wr   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ElectraClassificationHeadi<  z-Head for sentence-level classification tasks.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        S5      U l	        [        R                  " U5      U l        [        R                  " UR                  UR                  5      U l        g )Nrz  )r   r   r	   r   r   r   classifier_dropoutr   r   ro  r   r   
num_labelsout_projr   r^   r  r   s      r-   r   "ElectraClassificationHead.__init__?  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 )0zz"45		&"4"4f6G6GHr   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   ro  r  )r   featureskwargsr   s       r-   r   !ElectraClassificationHead.forwardI  sZ    Q1WLLOJJqMOOALLOMM!r   )ro  r   r   r  rv  r   s   @r-   r  r  <  s    7I r   r  c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\R                  4S jjrS	rU =r$ )ElectraSequenceSummaryiT  a  
Compute a single vector summary of a sequence hidden states.

Args:
    config ([`ElectraConfig`]):
        The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
        config class of your model for the default values it uses):

        - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

            - `"last"` -- Take the last token hidden state (like XLNet)
            - `"first"` -- Take the first token hidden state (like Bert)
            - `"mean"` -- Take the mean of all tokens hidden states
            - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
            - `"attn"` -- Not implemented now, use multi-head attention

        - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
        - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
          (otherwise to `config.hidden_size`).
        - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
          another string or `None` will add no activation.
        - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
        - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
r^   c                   > [         TU ]  5         [        USS5      U l        U R                  S:X  a  [        e[
        R                  " 5       U l        [        US5      (       a  UR                  (       aq  [        US5      (       a.  UR                  (       a  UR                  S:  a  UR                  nOUR                  n[
        R                  " UR                  U5      U l        [        USS 5      nU(       a  [        U5      O[
        R                  " 5       U l        [
        R                  " 5       U l        [        US5      (       a5  UR"                  S:  a%  [
        R$                  " UR"                  5      U l        [
        R                  " 5       U l        [        US	5      (       a7  UR(                  S:  a&  [
        R$                  " UR(                  5      U l        g g g )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r   r   rP   r  NotImplementedErrorr	   Identitysummaryr   r  r  r  r   r   r   ro  first_dropoutr  r   last_dropoutr  )r   r^   num_classesactivation_stringr   s       r-   r   ElectraSequenceSummary.__init__n  sa   #FNFC& &%{{}6-..63J3Jv788V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]62338T8TWX8X!#F,H,H!IDKKM6122v7R7RUV7V "

6+F+F GD 8W2r   r   	cls_indexr   c                    U R                   S:X  a  USS2S4   nGOU R                   S:X  a  USS2S4   nGOU R                   S:X  a  UR                  SS9nOU R                   S	:X  a  Uc?  [        R                  " US
SS2SS24   UR                  S   S-
  [        R
                  S9nOXUR                  S5      R                  S5      nUR                  SUR                  5       S-
  -  UR                  S5      4-   5      nUR                  SU5      R                  S5      nOU R                   S:X  a  [        eU R                  W5      nU R                  U5      nU R                  U5      nU R!                  U5      nU$ )a#  
Compute a single vector summary of a sequence hidden states.

Args:
    hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
        The hidden states of the last layer.
    cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
        Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

Returns:
    `torch.FloatTensor`: The summary of the sequence hidden states.
r  Nrx   firstr   r  r    r   r  .r   r}   )rx   r  )r  r  rY   	full_likerU   r   	unsqueezer   r   r   gatherrr  r  r  r  ro  r  )r   r   r  r  s       r-   r   ElectraSequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*r   )ro  r  r  r  r  r   )r   r   r   r   r   r!   r   rY   r   r   r   r   r   r   r   s   @r-   r  r  T  sV    2H} H< Y])"..);CEDTDT;U)			) )r   r  z
    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ ) ElectraForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        U R                  5         g r   )	r   r   r  r^   r  r  r  r8   r  r   s     r-   r   )ElectraForSequenceClassification.__init__  sF      ++#F+3F; 	r   r   r   r|   rw   r   r   labelsr   rS  rT  r   c                 0   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r|   rw   r   r   r   rS  rT  r   r    
regressionsingle_label_classificationmulti_label_classificationrx   r  rt  r   rZ  )r^   r  r  r8   problem_typer  r~   rY   r   rR   r   rr  r   r   r
   r   r   rZ  )r   r   r   r|   rw   r   r   r  r   rS  rT  rs  sequence_outputrt  r  loss_fctr  s                    r-   r   (ElectraForSequenceClassification.forward  s   ( &1%<k$++B]B]&*ll))%'/!5# '3 
'
# 6a81{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y!<QR!@@F)-)9TGf$EvE'5CC2==	
 	
r   )r8   r^   r  r  
NNNNNNNNNN)r   r   r   r   r   r   r   rY   r   r   r   r   r   r   r   r   r   s   @r-   r  r    s     -11515/3,004)-,0/3&*D
ELL)D
 !.D
 !.	D

 u||,D
 ELL)D
  -D
 &D
 $D>D
 'tnD
 d^D
 
uU\\"$<<	=D
 D
r   r  z
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ )ElectraForPreTrainingi  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )r   r   r  r  rl  discriminator_predictionsr  r   s     r-   r   ElectraForPreTraining.__init__  s3     #F+)H)P&r   r   r   r|   rw   r   r   r  r   rS  rT  r   c                 l   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUb  [        R
                  " 5       nUba  UR                  SUR                  S   5      S:H  nUR                  SUR                  S   5      U   nUU   nU" UUR                  5       5      nO4U" UR                  SUR                  S   5      UR                  5       5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring)
    Indices should be in `[0, 1]`:

    - 0 indicates the token is an original token,
    - 1 indicates the token was replaced.

Examples:

```python
>>> from transformers import ElectraForPreTraining, AutoTokenizer
>>> import torch

>>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
>>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

>>> sentence = "The quick brown fox jumps over the lazy dog"
>>> fake_sentence = "The quick brown fox fake over the lazy dog"

>>> fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True)
>>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
>>> discriminator_outputs = discriminator(fake_inputs)
>>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)

>>> fake_tokens
['[CLS]', 'the', 'quick', 'brown', 'fox', 'fake', 'over', 'the', 'lazy', 'dog', '[SEP]']

>>> predictions.squeeze().tolist()
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
```Nr  r   rx   r    r  )r^   r  r  r  r	   r
   r   rU   floatr  r   rZ  )r   r   r   r|   rw   r   r   r  r   rS  rT  rs  discriminator_sequence_outputrt  r  r  active_lossactive_logitsactive_labelsr  s                       r-   r   ElectraForPreTraining.forward   sp   Z &1%<k$++B]B]&*ll))%'/!5# '3 
'
# )DA(F%//0MN++-H),11"6S6Y6YZ[6\]abb &B0M0S0STU0V WXc d &{ 3}/B/B/DEB0M0S0STU0V WY_YeYeYghY!<QR!@@F)-)9TGf$EvE*5CC2==	
 	
r   )r  r  r  )r   r   r   r   r   r   r   rY   r   r   r   r   r  r   r   r   r   s   @r-   r  r    s     -11515/3,004)-,0/3&*Q
ELL)Q
 !.Q
 !.	Q

 u||,Q
 ELL)Q
  -Q
 &Q
 $D>Q
 'tnQ
 d^Q
 
uU\\"$??	@Q
 Q
r   r  z
    Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
    the two to have been trained for the masked language modeling task.
    c                   ~  ^  \ rS rSrS/rU 4S jrS rS r\          SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rSrU =r$ )rK   iu  generator_lm_head.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  " UR                  UR                  5      U l
        U R                  5         g r   )r   r   r  r  rx  generator_predictionsr	   r   r   r   generator_lm_headr  r   s     r-   r   ElectraForMaskedLM.__init__  sR     #F+%@%H"!#6+@+@&BSBS!Tr   c                     U R                   $ r   r  r  s    r-   get_output_embeddings(ElectraForMaskedLM.get_output_embeddings      %%%r   c                     Xl         g r   r   )r   r   s     r-   set_output_embeddings(ElectraForMaskedLM.set_output_embeddings  s    !0r   r   r   r|   rw   r   r   r  r   rS  rT  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUbQ  [
        R                  " 5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Nr  r   rx   r    r  )r^   r  r  r  r  r	   r   r   r   r   r   rZ  )r   r   r   r|   rw   r   r   r  r   rS  rT  r}  generator_sequence_outputprediction_scoresr  r  r  s                    r-   r   ElectraForMaskedLM.forward  s   ( &1%<k$++B]B]"&,,))%'/!5# #/ 
#
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D'),CAB,GGF)-)9TGf$EvE$1??.99	
 	
r   r  r  r  r  )r   r   r   r   _tied_weights_keysr   r  r  r   r   rY   r   r   r   r   r   r   r   r   r   s   @r-   rK   rK   u  s    55&1  -11515/3,004)-,0/3&*4
ELL)4
 !.4
 !.	4

 u||,4
 ELL)4
  -4
 &4
 $D>4
 'tn4
 d^4
 
uU\\"N2	34
 4
r   rK   z
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ )ElectraForTokenClassificationi  c                 f  > [         TU ]  U5        UR                  U l        [        U5      U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r   )r   r   r  r  r  r  r   r	   r   r   r   r   r8   r  r  s      r-   r   &ElectraForTokenClassification.__init__  s      ++#F+)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJr   r   r   r|   rw   r   r   r  r   rS  rT  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   rx   r    r  )r^   r  r  r   r8   r   r   r  r   r   rZ  )r   r   r   r|   rw   r   r   r  r   rS  rT  rs  r  rt  r  r  r  s                    r-   r   %ElectraForTokenClassification.forward  s    $ &1%<k$++B]B]&*ll))%'/!5# '3 
'
# )DA(F%(,5R(S%!>?')HFKKDOO<fkk"oNDY!<QR!@@F)-)9TGf$EvE$5CC2==	
 	
r   )r8   r   r  r  r  )r   r   r   r   r   r   r   rY   r   r   r   r   r   r   r   r   r   s   @r-   r  r    s     -11515/3,004)-,0/3&*1
ELL)1
 !.1
 !.	1

 u||,1
 ELL)1
  -1
 &1
 $D>1
 'tn1
 d^1
 
uU\\"$99	:1
 1
r   r  c                     ^  \ rS rSr\rSrU 4S jr\           SS\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rSrU =r$ )ElectraForQuestionAnsweringi  r  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   )
r   r   r  r  r  r	   r   r   
qa_outputsr  r   s     r-   r   $ElectraForQuestionAnswering.__init__  sS      ++#F+))F$6$68I8IJ 	r   r   r   r|   rw   r   r   start_positionsend_positionsr   rS  rT  r   c                 "   Ub  UOU R                   R                  nU R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	N)r   r|   rw   r   r   r   rS  r   r    rx   r   )ignore_indexr9   )r  start_logits
end_logitsr   rZ  )r^   r  r  r  rM   rr  r   rQ   r   clampr   r   r   rZ  )r   r   r   r|   rw   r   r   r  r  r   rS  rT  rs  r  rt  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r-   r   #ElectraForQuestionAnswering.forward!  s    &1%<k$++B]B]&*ll))%'/!5 '3 	'
# 6a81#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J ,AB/0F 0:/EZMF*Q6Q+%!5CC2==
 	
r   )r  r  r  )NNNNNNNNNNN)r   r   r   r   r!   r  r  r   r   r   rY   r   r   r   r   r   r   r   r   r   s   @r-   r  r    s6    L!  -11515/3,0042604,0/3&*@
ELL)@
 !.@
 !.	@

 u||,@
 ELL)@
  -@
 "%,,/@
  -@
 $D>@
 'tn@
 d^@
 
uU\\"$@@	A@
 @
r   r  c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ )ElectraForMultipleChoiceie  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  " UR                  S5      U l	        U R                  5         g rn  )r   r   r  r  r  sequence_summaryr	   r   r   r8   r  r   s     r-   r   !ElectraForMultipleChoice.__init__g  sM     #F+ 6v >))F$6$6: 	r   r   r   r|   rw   r   r   r  r   rS  rT  r   c                 \   U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr    rx   r   r  r   r  )r^   r  rU   r   r   r  r'  r8   r   r   r   rZ  )r   r   r   r|   rw   r   r   r  r   rS  rT  num_choicesrs  r  pooled_outputrt  reshaped_logitsr  r  r  s                       r-   r    ElectraForMultipleChoice.forwardq  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 '+ll))%'/!5# '3 
'
# 6a8--o>/ ++b+6')HOV4D%'*Eab*IIF)-)9TGf$EvE("5CC2==	
 	
r   )r8   r  r'  r  )r   r   r   r   r   r   r   rY   r   r   r   r   r   r   r   r   r   s   @r-   r%  r%  e  s     -11515/3,004)-,0/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
uU\\"$==	>X
 X
r   r%  zS
    ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.
    c            "         ^  \ rS rSrS/rU 4S jrS rS r\              SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\   S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rS rSrU =r$ )ElectraForCausalLMi  r  c                 0  > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g )NzOIf you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`)r   r   r   r?   warningr  r  rx  r  r	   r   r   r   r  init_weightsr   s     r-   r   ElectraForCausalLM.__init__  sj       NNlm#F+%@%H"!#6+@+@&BSBS!Tr   c                     U R                   $ r   r   r  s    r-   r  (ElectraForCausalLM.get_output_embeddings  r  r   c                     Xl         g r   r   )r   new_embeddingss     r-   r  (ElectraForCausalLM.set_output_embeddings  s    !/r   r   r   r|   rw   r   r   r   r   r  rR  r   r   rS  rT  r   c                    Ub  UOU R                   R                  nU	b  SnU R                  UUUUUUUUU
UUUUS9nUS   nU R                  U R	                  U5      5      nSnU	b*  U R
                  " UU	4SU R                   R                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, ElectraForCausalLM, ElectraConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator")
>>> config = ElectraConfig.from_pretrained("google/electra-base-generator")
>>> config.is_decoder = True
>>> model = ElectraForCausalLM.from_pretrained("google/electra-base-generator", config=config)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```NF)r   r|   rw   r   r   r   r   rR  r   r   rS  rT  r   r   r    )r  rt  rR  r   rZ  r[  )r^   r  r  r  r  loss_functionr   r   rR  r   rZ  r[  )r   r   r   r|   rw   r   r   r   r   r  rR  r   r   rS  rT  r  r   r  r	  lm_lossr  s                        r-   r   ElectraForCausalLM.forward  s,   R &1%<k$++B]B]I,,))%'"7#9+/!5#  
  "!* 2243M3Mo3^_((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r   c                 P   ^ SnU H  nU[        U4S jU 5       5      4-  nM     U$ )Nr*   c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectr   r   )r+   
past_statebeam_idxs     r-   r.   4ElectraForCausalLM._reorder_cache.<locals>.<genexpr>B  s1     ncmU_--aZ=N=N1OPPcms   7:)ra  )r   rR  rA  reordered_past
layer_pasts     `  r-   _reorder_cache!ElectraForCausalLM._reorder_cache>  s8    )Jncmnn N * r   r  )NNNNNNNNNNNNNN)r   r   r   r   r  r   r  r  r   r   rY   r   r   r   r   r   r   r   rE  r   r   r   s   @r-   r/  r/    s    55
&0  -11515/3,0048<9=)-8<$(,0/3&*S
ELL)S
 !.S
 !.	S

 u||,S
 ELL)S
  -S
  (5S
 !) 6S
 &S
 "$u||"45S
 D>S
 $D>S
 'tnS
 d^S
" 
uU\\"$EE	F#S
 S
l r   r/  )
r/  rK   r%  r  r  r  r  r  r  ro   )discriminator)Ir   r   rA   dataclassesr   typingr   r   r   r   r   rY   torch.utils.checkpointr	   torch.nnr
   r   r   activationsr   r   
generationr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_electrar!   
get_loggerr   r?   ro   Modulerq   r   r   r  r  r!  r,  r2  rI  rl  rx  r  r  r  r  r  r  r  rK   r  r  r%  r/  __all__r*   r   r-   <module>rV     s     	 ! 9 9    A A 1 )	 	 	 . l l 
 1 
		H	%Od?		 ?FC299 CN		  !" 0ryy 0h"))  BII S299 SnZ
RYY Z
zbii &")) $ *_ * *. :+ : :8 s) s sl		 0`RYY `F P
'= P
P
f [
2 [
[
| H
/ H
H
V @
$: @
@
F O
"8 O
 O
d d
5 d
 d
N 
r/ r
rjr   