
    fTh@                        S r SSKrSSKrSSKJrJrJr  SSKrSSKrSSKJ	r	  SSK
JrJrJr  SSKJr  SSKJr  SS	KJrJrJrJrJrJrJrJr  SS
KJr  SSKJrJrJ r   SSK!J"r"J#r#  SSK$J%r%  \#RL                  " \'5      r(S r) " S S\	RT                  5      r+ " S S\	RT                  5      r, " S S\	RT                  5      r- " S S\	RT                  5      r. " S S\	RT                  5      r/ " S S\	RT                  5      r0 " S S\	RT                  5      r1 " S S\	RT                  5      r2 " S  S!\	RT                  5      r3 " S" S#\	RT                  5      r4 " S$ S%\	RT                  5      r5 " S& S'\	RT                  5      r6\" " S( S)\5      5       r7\"" S*S+9 " S, S-\75      5       r8\" " S. S/\75      5       r9\"" S0S+9 " S1 S2\7\5      5       r:\"" S3S+9 " S4 S5\75      5       r;\" " S6 S7\75      5       r<\" " S8 S9\75      5       r=\" " S: S;\75      5       r>/ S<Qr?g)=zPyTorch RemBERT model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GenerationMixin))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )RemBertConfigc           
        ^  SSK nSSKnSSKn[        R                  R                  U5      n[        R                  SU 35        UR                  R                  U5      n/ n/ n	U H{  u  mn
[        U4S jS 5       5      (       a  M"  [        R                  ST SU
 35        UR                  R                  UT5      nUR                  T5        U	R                  U5        M}     [        X5       GH  u  mnTR!                  S	S
5      mTR#                  S5      m[        S T 5       5      (       a)  [        R                  SSR%                  T5       35        Mj  U nT H  nUR'                  SU5      (       a  UR#                  SU5      nOU/nUS   S:X  d	  US   S:X  a  [)        US5      nOZUS   S:X  d	  US   S:X  a  [)        US5      nO;US   S:X  a  [)        US5      nO%US   S:X  a  [)        US5      nO [)        XS   5      n[/        U5      S:  d  M  [1        US   5      nX   nM     WSS S:X  a  [)        US5      nOUS:X  a  UR3                  U5      n UR4                  UR4                  :w  a&  [7        SUR4                   SUR4                   S 35      e [        R                  S!T 35        [<        R>                  " U5      Ul         GM     U $ ! [         a    [        R                  S5        e f = f! [*         a8    [        R                  SR-                  SR%                  T5      5      5         GM  f = f! [8         a1  nU=R:                  UR4                  UR4                  4-  sl        e SnAff = f)"z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from c              3   ,   >#    U  H	  oT;   v   M     g 7fN ).0denynames     d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/rembert/modeling_rembert.py	<genexpr>-load_tf_weights_in_rembert.<locals>.<genexpr>E   s     X(Wt|(Ws   )adam_vadam_moutput_embeddingclszLoading TF weight z with shape zbert/zrembert//c              3   ,   #    U  H
  nUS ;   v   M     g7f))r'   r(   AdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepNr    )r!   ns     r$   r%   r&   V   s      
 nns   z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifierzSkipping {}   r   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )!renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesanyload_variableappendzipreplacesplitjoin	fullmatchgetattrAttributeErrorformatlenint	transposeshape
ValueErrorAssertionErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathr<   nptftf_path	init_varsnamesarraysrV   arraypointerm_namescope_namesnumer#   s                    @r$   load_tf_weights_in_rembertrl   /   sL   
 ggoo01G
KK8	BC''0IEF e X(WXXX(l5'BC&&w5Te ! 5)e||GZ0 zz#  

 
 
 KK)CHHTN#345F||,f55 hhy&9%h1~)[^w-F!'84Q=0KNf4L!'62Q#33!'84Q7*!'<8%g1~>G ;1$+a.)!,+ , #$<=(gx0GxLL'E	}}+ >'--@QRWR]R]Q^^i!jkk ,
 	078''.c *d LS  Q	
 	n & KK 4 4SXXd^ DE  	FFw}}ekk22F	s6   K :K<A M!K9<=L>=L>
M<,M77M<c                      ^  \ rS rSrSrU 4S jr     SS\\R                     S\\R                     S\\R                     S\\R                     S\
S	\R                  4S
 jjrSrU =r$ )RemBertEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.c                 v  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  g )N)padding_idxepsposition_ids)r   F)
persistent)super__init__r   	Embedding
vocab_sizeinput_embedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrZ   arangeexpandselfr^   	__class__s     r$   rx   RemBertEmbeddings.__init__   s    !||v::H[H[ 
 $&<<0N0NPVPkPk#l %'\\&2H2H&JeJe%f" f&A&AvG\G\]zz&"<"<= 	ELL)G)GHOOPWXej 	 	
    	input_idstoken_type_idsrt   inputs_embedspast_key_values_lengthreturnc                    Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XWU-   24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUc  U R                  U5      nU R                  U5      nXH-   n	U R                  U5      n
X-  n	U R                  U	5      n	U R                  U	5      n	U	$ )Nru   r   dtypedevice)sizert   rZ   zeroslongr   r}   r   r   r   r   )r   r   r   rt   r   r   input_shape
seq_lengthr   
embeddingsr   s              r$   forwardRemBertEmbeddings.forward   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL!"[[EJJtO`O`OgOghN  00;M $ : :> J":
"66|D)
^^J/
\\*-
r   )r   r   r   r   r}   )NNNNr   )__name__
__module____qualname____firstlineno____doc__rx   r   rZ   
LongTensorFloatTensorrT   Tensorr   __static_attributes____classcell__r   s   @r$   rn   rn      s    Q
( 15593759&'E,,- !!1!12 u//0	
   1 12 !$ 
 r   rn   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertPooler   c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rw   rx   r   Linearhidden_sizedenseTanh
activationr   s     r$   rx   RemBertPooler.__init__   s9    YYv1163E3EF
'')r   hidden_statesr   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r   r   first_token_tensorpooled_outputs       r$   r   RemBertPooler.forward   s6     +1a40

#566r   )r   r   
r   r   r   r   rx   rZ   r   r   r   r   r   s   @r$   r   r      s(    $
U\\ ell  r   r   c                   
  ^  \ rS rSrU 4S jrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\S\
4S jjrSrU =r$ )RemBertSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        UR"                  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())rw   rx   r   num_attention_headshasattrrW   rT   attention_head_sizeall_head_sizer   r   querykeyvaluer   attention_probs_dropout_probr   
is_decoderr   s     r$   rx   RemBertSelfAttention.__init__   s,    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++r   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nru   r   r:   r   r
   )r   r   r   viewpermute)r   xnew_x_shapes      r$   transpose_for_scores)RemBertSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$r   r   attention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsr   c                    U R                  U5      nUS Ln	U	(       a  Ub  US   n
US   nUnGOU	(       aC  U R                  U R                  U5      5      n
U R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      n
U R                  U R                  U5      5      n[        R
                  " US   U
/SS9n
[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U5      nU R                  (       a  X4n[        R                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUb  X-   n[        R                  R                  USS9nU R                  U5      nUb  X-  n[        R                  " X5      nUR!                  SSSS5      R#                  5       nUR%                  5       S S U R&                  4-   nUR(                  " U6 nU(       a  X4OU4nU R                  (       a  UU4-   nU$ )Nr   r   r:   dimru   r
   )r   r   r   r   rZ   catr   matmulrU   mathsqrtr   r   
functionalsoftmaxr   r   
contiguousr   r   r   )r   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                     r$   r   RemBertSelfAttention.forward   sJ    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB?? (5N !<<5H5HR5PQ+dii8P8P.QQ%/@ --//0@b/I ,,7  -9O_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD6G=2mM]?? 11Gr   )r   r   r   r   r   r   r   r   NNNNNF)r   r   r   r   rx   r   rZ   r   r   r   r   boolr   r   r   r   s   @r$   r   r      s    ,(% 7;15=A>BDH"'L||L !!2!23L E--.	L
  ((9(9:L !)):): ;L !uU->->'?!@AL  L 
L Lr   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )RemBertSelfOutputi2  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nrr   )rw   rx   r   r   r   r   r   r   r   r   r   r   s     r$   rx   RemBertSelfOutput.__init__3  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   r   r   r   r   s      r$   r   RemBertSelfOutput.forward9  5    

=1]3}'CDr   r   r   r   r   r   s   @r$   r   r   2  6    >U\\  RWR^R^  r   r   c                   *  ^  \ rS rSrU 4S jrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\\   S\
\R                     4S jjrSrU =r$ )RemBertAttentioni@  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )rw   rx   r   r   r   outputsetpruned_headsr   s     r$   rx   RemBertAttention.__init__A  s0    (0	'/Er   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )rS   r   r   r   r   r  r   r   r   r   r   r   r   union)r   headsindexs      r$   prune_headsRemBertAttention.prune_headsH  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r   r   r   r   r   r   r   r   r   c           	      p    U R                  UUUUUUU5      nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r   )r   r   )r   r   r   r   r   r   r   r   self_outputsattention_outputr   s              r$   r   RemBertAttention.forward[  sW     yy!"
  ;;|AF#%QR(88r   )r   r  r   r   )r   r   r   r   rx   r  rZ   r   r   r   r   r   r   r   r   r   s   @r$   r   r   @  s    ";, 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	 r   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertIntermediateit  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rw   rx   r   r   r   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r$   rx   RemBertIntermediate.__init__u  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r  r   r   s     r$   r   RemBertIntermediate.forward}  s&    

=100?r   r  r   r   s   @r$   r  r  t  s(    9U\\ ell  r   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )RemBertOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rw   rx   r   r   r  r   r   r   r   r   r   r   r   s     r$   rx   RemBertOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r$   r   RemBertOutput.forward  r   r   r   r   r   s   @r$   r  r    r   r   r  c                   *  ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4S jjrS rSrU =r$ )RemBertLayeri  c                 v  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a/  U R                  (       d  [        U  S35      e[	        U5      U l	        [        U5      U l        [        U5      U l        g )Nr   z> should be used as a decoder model if cross attention is added)rw   rx   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionrW   crossattentionr  intermediater  r   r   s     r$   rx   RemBertLayer.__init__  s    '-'E'E$)&1 ++#)#=#= ##?? D6)g!hii"26":D/7#F+r   r   r   r   r   r   r   r   r   c           	         Ub  US S OS nU R                  UUUUUS9n	U	S   n
U R                  (       a  U	SS nU	S   nOU	SS  nS nU R                  (       aZ  UbW  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  U
UUUUUU5      nUS   n
XSS -   nUS   nWU-   n[        U R                  U R                  U R                  U
5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nr:   )r   r   r   r   ru   r(  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r&  r   r   rW   r(  r   feed_forward_chunkr$  r%  )r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr  r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r$   r   RemBertLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r)  r   )r   r  intermediate_outputr3  s       r$   r,  RemBertLayer.feed_forward_chunk  s)    "//0@A{{#6Ir   )r'  r&  r$  r(  r)  r   r   r%  r   )r   r   r   r   rx   rZ   r   r   r   r   r   r   r,  r   r   r   s   @r$   r"  r"    s    ,$ 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?D r   r"  c                   &  ^  \ rS rSrU 4S jr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\
S\
S\
S\\	\4   4S jjrSrU =r$ )RemBertEncoderi  c                 6  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l        g s  snf )NF)rw   rx   r^   r   r   r{   r   embedding_hidden_mapping_in
ModuleListrangenum_hidden_layersr"  layergradient_checkpointing)r   r^   _r   s      r$   rx   RemBertEncoder.__init__  sr    +-99V5P5PRXRdRd+e(]]%H`H`Ba#bBaQL$8Ba#bc
&+# $cs   -Br   r   r   r   r   past_key_values	use_cacher   output_hidden_statesreturn_dictr   c                 Z   U R                   (       a/  U R                  (       a  U(       a  [        R                  S5        SnU R	                  U5      nU	(       a  SOS nU(       a  SOS nU(       a  U R
                  R                  (       a  SOS nU(       a  SOS n[        U R                  5       H  u  nnU	(       a  X4-   nUb  X?   OS nUb  Xo   OS nU R                   (       a4  U R                  (       a#  U R                  UR                  UUUUUUU5      nOU" UUUUUUU5      nUS   nU(       a	  UUS   4-  nU(       d  M  UUS   4-   nU R
                  R                  (       d  M  UUS   4-   nM     U	(       a  X4-   nU
(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr    r   ru   r   r:   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r    )r!   vs     r$   r%   )RemBertEncoder.forward.<locals>.<genexpr>5  s"      
A  s   	)last_hidden_staterC  r   
attentionscross_attentions)r@  trainingr@   warning_oncer;  r^   r'  	enumerater?  _gradient_checkpointing_func__call__tupler   )r   r   r   r   r   r   rC  rD  r   rE  rF  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       r$   r   RemBertEncoder.forward  s    &&4==##p "	88G"6BD$5b4%64;;;Z;Zr`d#,R$(4OA|#$58H$H!.7.CilO3B3N_/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::"  &9]1=M<O&O#;;222+?=QRCSBU+U(G  5J   14D D 
 "&%'(
 
 
 9+.+*1
 	
r   )r^   r;  r@  r?  )	NNNNNNFFT)r   r   r   r   rx   rZ   r   r   r   r   r   r   r   r   r   r   r   s   @r$   r9  r9    s    , 7;15=A>BEI$("'%* S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
  S
 #S
 S
 
u??	@S
 S
r   r9  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertPredictionHeadTransformiJ  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )rw   rx   r   r   r   r   r  r  r  r   transform_act_fnr   r   r   s     r$   rx   'RemBertPredictionHeadTransform.__init__K  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r`  r   r  s     r$   r   &RemBertPredictionHeadTransform.forwardT  s4    

=1--m<}5r   )r   r   r`  r   r   s   @r$   r^  r^  J  s)    UU\\ ell  r   r^  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertLMPredictionHeadi[  c                 n  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  5      U l        [        UR                     U l        [        R                  " UR
                  UR                  S9U l        g r   )rw   rx   r   r   r   output_embedding_sizer   rz   decoderr   r  r   r   r   r   s     r$   rx    RemBertLMPredictionHead.__init__\  sz    YYv1163O3OP
yy!=!=v?P?PQ !2!23f&B&BH]H]^r   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   rh  r  s     r$   r   RemBertLMPredictionHead.forwardc  s@    

=16}5]3r   )r   r   rh  r   r   r   s   @r$   re  re  [  s)    _U\\ ell  r   re  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertOnlyMLMHeadil  c                 B   > [         TU ]  5         [        U5      U l        g r   )rw   rx   re  predictionsr   s     r$   rx   RemBertOnlyMLMHead.__init__m  s    26:r   sequence_outputr   c                 (    U R                  U5      nU$ r   ro  )r   rq  prediction_scoress      r$   r   RemBertOnlyMLMHead.forwardq  s     ,,_=  r   rs  r   r   s   @r$   rm  rm  l  s(    ;!u|| ! ! !r   rm  c                   *    \ rS rSr\r\rSrSr	S r
Srg)RemBertPreTrainedModeliv  rembertTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weightsg        )meanstdNg      ?)r  r   r   r3   r\   normal_r^   initializer_ranger6   zero_ry   rq   r   fill_)r   modules     r$   _init_weights$RemBertPreTrainedModel._init_weights}  s   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r   r    N)r   r   r   r   r   config_classrl   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr  r   r    r   r$   rw  rw  v  s     L0O!&*#*r   rw  a  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    )custom_introc                      ^  \ rS rSrSU 4S jjrS rS rS r\             SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\\
R                           S\	\   S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )RemBertModeli  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
rw   rx   r^   rn   r   r9  encoderr   pooler	post_init)r   r^   add_pooling_layerr   s      r$   rx   RemBertModel.__init__  sK    
 	 +F3%f-/@mF+d 	r   c                 .    U R                   R                  $ r   r   r}   r   s    r$   get_input_embeddings!RemBertModel.get_input_embeddings  s    ...r   c                 $    XR                   l        g r   r  )r   r   s     r$   set_input_embeddings!RemBertModel.set_input_embeddings  s    */'r   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r?  r&  r  )r   heads_to_pruner?  r  s       r$   _prune_headsRemBertModel._prune_heads  s<    
 +002LELLu%//;;EB 3r   r   r   r   rt   r   r   r   r   rC  rD  r   rE  rF  r   c                     Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U
b  U
OU R                   R
                  n
OSn
Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  nnUb  UR                  OUR                  nU	b  U	S   S   R                  S   OSnUc  [        R                  " UUU-   4US9nUc$  [        R                  " U[        R                  US9nU R                  X.5      nU R                   R                  (       aE  UbB  UR                  5       u  nnnUU4nUc  [        R                  " UUS9nU R!                  U5      nOS nU R#                  XPR                   R$                  5      nU R'                  UUUUUS	9nU R)                  UUUUUU	U
UUUS
9
nUS   nU R*                  b  U R+                  U5      OS nU(       d
  UU4USS  -   $ [-        UUUR.                  UR0                  UR2                  UR4                  S9$ )NFzDYou cannot specify both input_ids and inputs_embeds at the same timeru   z5You have to specify either input_ids or inputs_embedsr   r:   )r   r   )r   rt   r   r   r   )	r   r   r   r   rC  rD  r   rE  rF  r   )rK  pooler_outputrC  r   rL  rM  )r^   r   rE  use_return_dictr   rD  rW   %warn_if_padding_and_no_attention_maskr   r   rV   rZ   onesr   r   get_extended_attention_maskinvert_attention_maskget_head_maskr>  r   r  r  r   rC  r   rL  rM  )r   r   r   r   rt   r   r   r   r   rC  rD  r   rE  rF  r   
batch_sizer   r   r   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthrA  encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputsrq  r   s                                r$   r   RemBertModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZ*jCY6Y)ZdjkN!"[[EJJvVN 150P0PQ_0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r   )r^   r   r  r  )T)NNNNNNNNNNNNN)r   r   r   r   rx   r  r  r  r   r   rZ   r   r   r   r   r   r   r   r   r   r   s   @r$   r  r    sw    /0C  155959371559=A>BEI$(,0/3&*f
E,,-f
 !!1!12f
 !!1!12	f

 u//0f
 E--.f
   1 12f
  ((9(9:f
 !)):): ;f
 "%e.?.?(@"ABf
 D>f
 $D>f
 'tnf
 d^f
 
uBB	Cf
 f
r   r  c                     ^  \ rS rSrS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSS jr\S\4S j5       rSrU =r$ )RemBertForMaskedLMi#  cls.predictions.decoder.weightc                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NznIf you want to use `RemBertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  
rw   rx   r   r@   warningr  rx  rm  r*   r  r   s     r$   rx   RemBertForMaskedLM.__init__'  sR     NN1
 $FeD%f- 	r   c                 B    U R                   R                  R                  $ r   r*   ro  rh  r  s    r$   get_output_embeddings(RemBertForMaskedLM.get_output_embeddings6      xx##+++r   c                 8    XR                   R                  l        g r   r  r   new_embeddingss     r$   set_output_embeddings(RemBertForMaskedLM.set_output_embeddings9      '5$r   r   r   r   rt   r   r   r   r   labelsr   rE  rF  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU
UUS9nUS   nU R                  U5      nSnU	bF  [	        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a{  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)
r   r   rt   r   r   r   r   r   rE  rF  r   ru   r:   losslogitsr   rL  )
r^   r  rx  r*   r   r   rz   r   r   rL  )r   r   r   r   rt   r   r   r   r   r  r   rE  rF  r   rq  rt  masked_lm_lossloss_fctr   s                      r$   r   RemBertForMaskedLM.forward<  s    , &1%<k$++B]B],,))%'"7#9/!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r   c                    UR                   nUS   nU R                  R                  c   S5       e[        R                  " X"R                  UR                   S   S45      /SS9n[        R                  " US4U R                  R                  [        R                  UR                  S9n[        R                  " X/SS9nXS.$ )Nr   z.The PAD token should be defined for generationr   ru   r   r   )r   r   )	rV   r^   r|   rZ   r   	new_zerosfullr   r   )r   r   r   model_kwargsr   effective_batch_sizedummy_tokens          r$   prepare_inputs_for_generation0RemBertForMaskedLM.prepare_inputs_for_generationu  s    oo*1~ {{''3e5ee3N4L4LnNbNbcdNeghMi4j#kqstjj!1%t{{'?'?uzzZcZjZj
 IIy6A>	&IIr   c                     g)z
Legacy correction: RemBertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
`prepare_inputs_for_generation` method.
Fr    )r*   s    r$   can_generateRemBertForMaskedLM.can_generate  s     r   r*   rx  )NNNNNNNNNNNNr   )r   r   r   r   _tied_weights_keysrx   r  r  r   r   rZ   r   r   r   r   r   r   r   r  classmethodr  r   r   r   s   @r$   r  r  #  sp   :;,6  155959371559=A>B-1,0/3&*6
E,,-6
 !!1!126
 !!1!12	6

 u//06
 E--.6
   1 126
  ((9(9:6
 !)):): ;6
 ))*6
 $D>6
 'tn6
 d^6
 
un$	%6
 6
pJ T  r   r  zS
    RemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    c            "         ^  \ rS rSrS/rU 4S jrS rS r\              SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\\
R                           S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\\\4   4S jj5       rS rSrU =r$ )RemBertForCausalLMi  r  c                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzOIf you want to use `RemBertForCausalLM` as a standalone, add `is_decoder=True.`Fr  r  r   s     r$   rx   RemBertForCausalLM.__init__  sL       NNlm#FeD%f- 	r   c                 B    U R                   R                  R                  $ r   r  r  s    r$   r  (RemBertForCausalLM.get_output_embeddings  r  r   c                 8    XR                   R                  l        g r   r  r  s     r$   r  (RemBertForCausalLM.set_output_embeddings  r  r   r   r   r   rt   r   r   r   r   rC  r  rD  r   rE  rF  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	UUUUS9nUS   nU R                  U5      nSnU
b*  U R                  " UU
4SU R                   R
                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, RemBertForCausalLM, RemBertConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/rembert")
>>> config = RemBertConfig.from_pretrained("google/rembert")
>>> config.is_decoder = True
>>> model = RemBertForCausalLM.from_pretrained("google/rembert", config=config)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```N)r   r   rt   r   r   r   r   rC  rD  r   rE  rF  r   rz   r:   )r  r  rC  r   rL  rM  )r^   r  rx  r*   loss_functionrz   r   rC  r   rL  rM  )r   r   r   r   rt   r   r   r   r   rC  r  rD  r   rE  rF  kwargsr   rq  rt  lm_lossr   s                        r$   r   RemBertForCausalLM.forward  s   R &1%<k$++B]B],,))%'"7#9+/!5#  
  "!* HH_5((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r   c                 b   ^ SnU H%  nU[        U4S jUS S  5       5      USS  -   4-  nM'     U$ )Nr    c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selecttor   )r!   
past_statebeam_idxs     r$   r%   4RemBertForCausalLM._reorder_cache.<locals>.<genexpr>  s1     rcqU_--aZ=N=N1OPPcqs   7:r:   )rS  )r   rC  r  reordered_past
layer_pasts     `  r$   _reorder_cache!RemBertForCausalLM._reorder_cache  sO    )JrcmnpopcqrrQR.! N *
 r   r  )NNNNNNNNNNNNNN)r   r   r   r   r  rx   r  r  r   r   rZ   r   r   r   r   r   r   r   r  r   r   r   s   @r$   r  r    s    ;;
,6  155959371559=A>BEI-1$(,0/3&*Q
E,,-Q
 !!1!12Q
 !!1!12	Q

 u//0Q
 E--.Q
   1 12Q
  ((9(9:Q
 !)):): ;Q
 "%e.?.?(@"ABQ
 ))*Q
 D>Q
 $D>Q
 'tnQ
 d^Q
" 
u77	8#Q
 Q
f r   r  z
    RemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ ) RemBertForSequenceClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   rw   rx   
num_labelsr  rx  r   r   classifier_dropout_probr   r   r   r9   r  r   s     r$   rx   )RemBertForSequenceClassification.__init__  si      ++#F+zz&"@"@A))F$6$68I8IJ 	r   r   r   r   rt   r   r   r  r   rE  rF  r   c                 R   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r   rt   r   r   r   rE  rF  r   
regressionsingle_label_classificationmulti_label_classificationru   r:   r  )r^   r  rx  r   r9   problem_typer  r   rZ   r   rT   r	   squeezer   r   r   r   r   rL  )r   r   r   r   rt   r   r   r  r   rE  rF  r   r   r  r  r  r   s                    r$   r   (RemBertForSequenceClassification.forward  s   ( &1%<k$++B]B],,))%'/!5#  

  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r   r9   r   r  rx  
NNNNNNNNNN)r   r   r   r   rx   r   r   rZ   r   r   r   r   r   r   r   r   r   r   s   @r$   r  r    s     266:59481559-1,0/3&*E
E--.E
 !!2!23E
 !!1!12	E

 u001E
 E--.E
   1 12E
 ))*E
 $D>E
 'tnE
 d^E
 
u..	/E
 E
r   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )RemBertForMultipleChoicei^  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )rw   rx   r  rx  r   r   r  r   r   r   r9   r  r   s     r$   rx   !RemBertForMultipleChoice.__init__`  sV     #F+zz&"@"@A))F$6$6: 	r   r   r   r   rt   r   r   r  r   rE  rF  r   c                 Z   U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   ru   r   r  r:   r  )r^   r  rV   r   r   rx  r   r9   r   r   r   rL  )r   r   r   r   rt   r   r   r  r   rE  rF  num_choicesr   r   r  reshaped_logitsr  r  r   s                      r$   r    RemBertForMultipleChoice.forwardj  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ,,))%'/!5#  

  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r   )r9   r   rx  r  )r   r   r   r   rx   r   r   rZ   r   r   r   r   r   r   r   r   r   r   s   @r$   r  r  ^  s     266:59481559-1,0/3&*X
E--.X
 !!2!23X
 !!1!12	X

 u001X
 E--.X
   1 12X
 ))*X
 $D>X
 'tnX
 d^X
 
u//	0X
 X
r   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )RemBertForTokenClassificationi  c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g NFr  r  r   s     r$   rx   &RemBertForTokenClassification.__init__  sk      ++#FeDzz&"@"@A))F$6$68I8IJ 	r   r   r   r   rt   r   r   r  r   rE  rF  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   ru   r:   r  )r^   r  rx  r   r9   r   r   r  r   r   rL  )r   r   r   r   rt   r   r   r  r   rE  rF  r   rq  r  r  r  r   s                    r$   r   %RemBertForTokenClassification.forward  s    $ &1%<k$++B]B],,))%'/!5#  

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r   r  r  )r   r   r   r   rx   r   r   rZ   r   r   r   r   r   r   r   r   r   r   s   @r$   r	  r	    s   	  266:59481559-1,0/3&*2
E--.2
 !!2!232
 !!1!12	2

 u0012
 E--.2
   1 122
 ))*2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
r   r	  c                   r  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )RemBertForQuestionAnsweringi	  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
rw   rx   r  r  rx  r   r   r   
qa_outputsr  r   s     r$   rx   $RemBertForQuestionAnswering.__init__  sU      ++#FeD))F$6$68I8IJ 	r   r   r   r   rt   r   r   start_positionsend_positionsr   rE  rF  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5        UR                  SU5        [        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   ru   r   )ignore_indexr:   )r  start_logits
end_logitsr   rL  )r^   r  rx  r  rM   r  rS   r   clamp_r   r   r   rL  )r   r   r   r   rt   r   r   r  r  r   rE  rF  r   rq  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          r$   r   #RemBertForQuestionAnswering.forward  s    &1%<k$++B]B],,))%'/!5#  

 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r   )r  r  rx  )NNNNNNNNNNN)r   r   r   r   rx   r   r   rZ   r   r   r   r   r   r   r   r   r   r   s   @r$   r  r  	  s$   	  266:594815596:48,0/3&*>
E--.>
 !!2!23>
 !!1!12	>

 u001>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
u22	3>
 >
r   r  )
r  r  r  r  r  r	  r"  r  rw  rl   )@r   r   rB   typingr   r   r   rZ   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   
generationr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_rembertr   
get_loggerr   r@   rl   Modulern   r   r   r   r   r  r  r"  r9  r^  re  rm  rw  r  r  r  r  r  r	  r  __all__r    r   r$   <module>r-     sN     	 ) )    A A ! )	 	 	 . l l , 0 
		H	%Pf3		 3nBII f299 fT		 0ryy 0h"))  BII U299 Up\
RYY \
@RYY "bii "! ! *_ * *. 	F
) F
F
R e/ e eP 
p/ p
pf Q
'= Q
Q
h d
5 d
 d
N ?
$: ?
 ?
D K
"8 K
 K
\r   