
"""PyTorch BERT model."""

import math
import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, get_torch_version, logging
from .configuration_bert import BertConfig


logger = logging.get_logger(__name__)
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to compute m and v,
        # which are not required for using the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model
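

# --- Illustrative conversion sketch (not part of the upstream module). The checkpoint and
# output paths below are hypothetical; `load_tf_weights_in_bert` expects a TensorFlow 1.x-style
# BERT checkpoint, the same format handled by the library's separate
# `convert_bert_original_tf_checkpoint_to_pytorch.py` script.
if __name__ == "__main__":
    from transformers import BertConfig, BertForPreTraining

    config = BertConfig.from_json_file("/tmp/bert_model/bert_config.json")  # hypothetical path
    model = BertForPreTraining(config)
    load_tf_weights_in_bert(model, config, "/tmp/bert_model/bert_model.ckpt")  # hypothetical path
    model.save_pretrained("/tmp/bert_pytorch")  # hypothetical path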
L?,L::L?c                      ^  \ rS rSrSrU 4S jr     SS\\R                     S\\R                     S\\R                     S\\R                     S\
S	\R                  4S
 jjrSrU =r$ )BertEmbeddings~   zGConstruct the embeddings from word, position and token_type embeddings.c                 .  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  g )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r#   F)
persistenttoken_type_idsdtype)super__init__r	   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutrQ   rt   register_bufferrY   arangeexpandzerosrv   sizelongselfr]   	__class__s     r0   r}   BertEmbeddings.__init__   s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsry   rv   inputs_embedspast_key_values_lengthreturnc                 d   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XWU-   24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      n	U	nO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R                  U5      nU R                  U5      nU$ )Nrw   r#   ry   r   r{   deviceru   )r   rv   hasattrry   r   rY   r   r   r   r   r   rt   r   r   r   )r   r   ry   rv   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s                r0   forwardBertEmbeddings.forward   sC     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r   )r   r   rt   r   r   r   )NNNNr   )__name__
__module____qualname____firstlineno____doc__r}   r   rY   
LongTensorFloatTensorrT   Tensorr   __static_attributes____classcell__r   s   @r0   rn   rn   ~   s    Q
* 15593759&''E,,-' !!1!12' u//0	'
   1 12' !$' 
' 'r   rn   c                   b  ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jr      SS\R                  S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
\
\R                           S\\   S\
\R                     4S jjrSrU =r$ )BertSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rt   ru   relative_keyrelative_key_queryr<   r#   )r|   r}   r   num_attention_headsr   rW   rT   attention_head_sizeall_head_sizer	   Linearquerykeyvaluer   attention_probs_dropout_probr   rQ   rt   r   r~   distance_embedding
is_decoderr   r]   rt   r   s      r0   r}   BertSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r   xr   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nrw   r   r<   r#   r   )r   r   r   viewpermute)r   r   new_x_shapes      r0   transpose_for_scores&BertSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 V   U R                  U5      nUS Ln	U	(       a  Ub  US   n
US   nUnGOU	(       aC  U R                  U R                  U5      5      n
U R                  U R                  U5      5      nUnOUbu  U R                  U R                  U5      5      n
U R                  U R                  U5      5      n[        R
                  " US   U
/SS9n
[        R
                  " US   U/SS9nO@U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U5      nUS LnU R                  (       a  X4n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  UR                  S   U
R                  S   nnU(       aB  [        R                  " US-
  [        R                  UR                  S	9R                  SS5      nO>[        R                  " U[        R                  UR                  S	9R                  SS5      n[        R                  " U[        R                  UR                  S	9R                  SS5      nUU-
  nU R!                  UU R"                  -   S-
  5      nUR%                  UR&                  S
9nU R                  S:X  a  [        R(                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R(                  " SUU5      n[        R(                  " SU
U5      nUU-   U-   nU[*        R,                  " U R.                  5      -  nUb  X-   n[0        R2                  R5                  USS9nU R7                  U5      nUb  UU-  n[        R                  " UU5      nUR9                  SSSS5      R;                  5       nUR=                  5       S S U R>                  4-   nUR                  U5      nU(       a  UU4OU4nU R                  (       a  UU4-   nU$ )Nr   r#   r<   dimrw   r   r   r   rz   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) r   r   r   r   rY   catr   matmulrU   rt   rV   tensorr   r   r   r   r   r   tor{   einsummathsqrtr   r	   
functionalsoftmaxr   r   
contiguousr   r   )r   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r0   r   BertSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr   )r   r   r   r   r   r   r   r   rt   r   r   NNNNNNF)r   r   r   r   r}   rY   r   r   r   r   r   boolr   r   r   r   s   @r0   r   r      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	c cr   r   c                   .  ^  \ rS rSrSU 4S jjr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4U 4S jjjrSrU =r$ )BertSdpaSelfAttentioniD  c                    > [         TU ]  XS9  UR                  U l        [        R
                  " [        5       5      [        R
                  " S5      :  U l        g )Nrt   z2.2.0)r|   r}   r   dropout_probr   parser!   require_contiguous_qkvr   s      r0   r}   BertSdpaSelfAttention.__init__E  sE    Q"??&-mm4E4G&H7==Y`Ka&a#r   r   r   r   r   r   r   r   r   c           	        > U R                   S:w  d
  U(       d  Ub*  [        R                  S5        [        TU ]  UUUUUUU5      $ UR                  5       u  pn
U R                  U R                  U5      5      nUS LnU(       a  UOUnU(       a  UOUnU(       a/  U(       a(  US   R                  S   UR                  S   :X  a  Uu  pO~U R                  U R                  U5      5      nU R                  U R                  U5      5      nUb;  U(       d4  [        R                  " US   U/SS9n[        R                  " US   U/SS9nU R                  (       a  X4nU R                  (       aM  UR                  R                   S:X  a3  Ub0  UR#                  5       nUR#                  5       nUR#                  5       nU R                  (       a  U(       d  Uc  U	S:  a  SOS	n[        R$                  R&                  R)                  UUUUU R*                  (       a  U R,                  OS
US9nUR/                  SS5      nUR1                  XU R2                  5      nU4nU R                  (       a  UU4-   nU$ )Nru   a  BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   r<   r#   r   cudaTF        )	attn_mask	dropout_p	is_causal)rt   rB   warning_oncer|   r   r   r   r   rV   r   r   rY   r   r   r   r   typer   r	   r   scaled_dot_product_attentiontrainingr   rU   reshaper   )r   r   r   r   r   r   r   r   bsztgt_len_r   r   current_statesr   r   r  attn_outputr   r   s                      r0   r   BertSdpaSelfAttention.forwardK  sZ    '':59JiNcH 7?%&!  (,,.a//

=0IJ 3$>2D.-3E/> .^A5F5L5LQ5OSaSgSghiSj5j%3"I{11$((>2JKI33DJJ~4NOK)2D!II~a'8)&D!L	#ii):K(HaP?? (5N
 &&;+=+=+B+Bf+LQ_Qk%002K!,,.I%002K OO,>>CY^ehi^iDot 	 hh))FF$+/==d''c G 
 "++Aq1!))#8J8JK.?? 11Gr   )r   r   r   r   )r   r   r   r   r}   rY   r   r   r   r   r   r   r   r   r   s   @r0   r   r   D  s    b 2615=A>BDH,1[||[ !.[ E--.	[
  ((9(9:[ !)):): ;[ !uU->->'?!@A[ $D>[ 
u||	[ [r   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BertSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nrr   )r|   r}   r	   r   r   denser   r   r   r   r   r   s     r0   r}   BertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r  r   r   r   r   r  s      r0   r   BertSelfOutput.forward  5    

=1]3}'CDr   r   r  r   
r   r   r   r   r}   rY   r   r   r   r   r   s   @r0   r  r    6    >U\\  RWR^R^  r   r  )eagersdpac                   .  ^  \ rS rSrSU 4S jjrS r      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
\
\R                           S
\\   S\
\R                     4S jjrSrU =r$ )BertAttentioni  c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )Nr   )	r|   r}   BERT_SELF_ATTENTION_CLASSES_attn_implementationr   r  outputsetpruned_headsr   s      r0   r}   BertAttention.__init__  s@    /0K0KL
	 %V,Er   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r#   r   )rS   r   r   r   r   r*  r   r   r   r   r(  r  r   union)r   headsindexs      r0   prune_headsBertAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r   r   r   r   r   r   r   r   r   c           	      p    U R                  UUUUUUU5      nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r#   )r   r(  )r   r   r   r   r   r   r   r   self_outputsattention_outputr   s              r0   r   BertAttention.forward  sW     yy!"
  ;;|AF#%QR(88r   )r(  r*  r   r   r   )r   r   r   r   r}   r0  rY   r   r   r   r   r   r   r   r   r   s   @r0   r$  r$    s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	 r   r$  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r|   r}   r	   r   r   intermediate_sizer  
isinstance
hidden_actstrr   intermediate_act_fnr   s     r0   r}   BertIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r  r=  r   r   s     r0   r   BertIntermediate.forward  s&    

=100?r   r@  r  r   s   @r0   r7  r7    s(    9U\\ ell  r   r7  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )
BertOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r|   r}   r	   r   r9  r   r  r   r   r   r   r   r   s     r0   r}   BertOutput.__init__   s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r   r   r  r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r  r  s      r0   r   BertOutput.forward  r  r   r  r  r   s   @r0   rD  rD    r   r   rD  c                   *  ^  \ rS rSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\	\R                     4S jjrS rSrU =r$ )	BertLayeri  c                 t  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        g )Nr#   z> should be used as a decoder model if cross attention is addedru   r   )r|   r}   chunk_size_feed_forwardseq_len_dimr$  	attentionr   add_cross_attentionrW   crossattentionr7  intermediaterD  r(  r   s     r0   r}   BertLayer.__init__  s    '-'E'E$&v. ++#)#=#= ##?? D6)g!hii"/PZ"[D,V4 (r   r   r   r   r   r   r   r   r   c           	         Ub  US S OS nU R                  UUUUUS9n	U	S   n
U R                  (       a  U	SS nU	S   nOU	SS  nS nU R                  (       aZ  UbW  [        U S5      (       d  [        SU  S35      eUb  US	S  OS nU R	                  U
UUUUUU5      nUS   n
XSS -   nUS   nWU-   n[        U R                  U R                  U R                  U
5      nU4U-   nU R                  (       a  UW4-   nU$ )
Nr<   )r   r   r   r#   rw   rP  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	rN  r   r   rW   rP  r   feed_forward_chunkrL  rM  )r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr4  r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r0   r   BertLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!122 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr   c                 J    U R                  U5      nU R                  X!5      nU$ r   )rQ  r(  )r   r4  intermediate_outputr[  s       r0   rT  BertLayer.feed_forward_chunk]  s)    "//0@A{{#6Ir   )rO  rN  rL  rP  rQ  r   r(  rM  r   )r   r   r   r   r}   rY   r   r   r   r   r   r   rT  r   r   r   s   @r0   rJ  rJ    s    )" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?B r   rJ  c                   R  ^  \ rS rSrU 4S jr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\\
   S\\
   S\\
   S\\	\R                     \4   4S jjrSrU =r$ )BertEncoderic  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r|   r}   r]   r	   
ModuleListrangenum_hidden_layersrJ  layergradient_checkpointing)r   r]   r  r   s      r0   r}   BertEncoder.__init__d  sR    ]]uVE]E]?^#_?^!If$5?^#_`
&+# $`s   A&r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr   c                 8   U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  SOS n[        U R                  5       H  u  nnU	(       a  X4-   nUb  X?   OS nUb  Xo   OS nU R                  (       a4  U R                  (       a#  U R                  UR                  UUUUUUU5      nOU" UUUUUUU5      nUS   nU(       a	  UUS   4-  nU(       d  M  UUS   4-   nU R                   R                  (       d  M  UUS   4-   nM     U	(       a  X4-   nU
(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
Nr-   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   rw   r#   r<   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r-   )r.   vs     r0   r1   &BertEncoder.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_stateri  r   
attentionscross_attentions)r]   rO  rg  r
  rB   r  	enumeraterf  _gradient_checkpointing_func__call__tupler   )r   r   r   r   r   r   ri  r   r   rj  rk  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       r0   r   BertEncoder.forwardj  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4OA|#$58H$H!.7.CilO3B3N_/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::"  &9]1=M<O&O#;;222+?=QRCSBU+U(G  5J   14D D 
 "&%'(
 
 
 9+.+*1
 	
r   )r]   rg  rf  )	NNNNNNFFT)r   r   r   r   r}   rY   r   r   r   r   r   r   r   r   r   r   r   s   @r0   ra  ra  c  s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
 S
r   ra  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
BertPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r|   r}   r	   r   r   r  Tanh
activationr   s     r0   r}   BertPooler.__init__  s9    YYv1163E3EF
'')r   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r  r  )r   r   first_token_tensorpooled_outputs       r0   r   BertPooler.forward  s6     +1a40

#566r   )r  r  r  r   s   @r0   r  r    s(    $
U\\ ell  r   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r  )r|   r}   r	   r   r   r  r:  r;  r<  r   transform_act_fnr   r   r   s     r0   r}   $BertPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r   rA  s     r0   r   #BertPredictionHeadTransform.forward  s4    

=1--m<}5r   )r   r  r  r  r   s   @r0   r  r    s)    UU\\ ell  r   r  c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )BertLMPredictionHeadi  c                 H  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )NF)r8   )r|   r}   r  	transformr	   r   r   r   decoder	ParameterrY   r   r8   r   s     r0   r}   BertLMPredictionHead.__init__  sm    4V< yy!3!3V5F5FUSLLV->->!?@	 !IIr   c                 :    U R                   U R                  l         g r   )r8   r  r   s    r0   _tie_weights!BertLMPredictionHead._tie_weights  s     IIr   c                 J    U R                  U5      nU R                  U5      nU$ r   )r  r  rA  s     r0   r   BertLMPredictionHead.forward  s$    }5]3r   )r8   r  r  )	r   r   r   r   r}   r  r   r   r   r   s   @r0   r  r    s    && r   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g r   )r|   r}   r  predictionsr   s     r0   r}   BertOnlyMLMHead.__init__  s    /7r   sequence_outputr   c                 (    U R                  U5      nU$ r   r  )r   r  prediction_scoress      r0   r   BertOnlyMLMHead.forward  s     ,,_=  r   r  r  r   s   @r0   r  r    s(    8!u|| ! ! !r   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertOnlyNSPHeadi  c                 n   > [         TU ]  5         [        R                  " UR                  S5      U l        g Nr<   )r|   r}   r	   r   r   seq_relationshipr   s     r0   r}   BertOnlyNSPHead.__init__  s'     "		&*<*<a @r   c                 (    U R                  U5      nU$ r   r  )r   r  seq_relationship_scores      r0   r   BertOnlyNSPHead.forward  s    !%!6!6}!E%%r   r  r   r   r   r   r}   r   r   r   r   s   @r0   r  r    s    A& &r   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertPreTrainingHeadsi  c                    > [         TU ]  5         [        U5      U l        [        R
                  " UR                  S5      U l        g r  )r|   r}   r  r  r	   r   r   r  r   s     r0   r}   BertPreTrainingHeads.__init__  s4    /7 "		&*<*<a @r   c                 L    U R                  U5      nU R                  U5      nX44$ r   r  r  )r   r  r  r  r  s        r0   r   BertPreTrainingHeads.forward  s-     ,,_=!%!6!6}!E 88r   r  r  r   s   @r0   r  r    s    A
9 9r   r  c                   .    \ rS rSr\r\rSrSr	Sr
S rSrg)BertPreTrainedModeli  bertTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a%  UR                  R                  R                  5         gg)zInitialize the weightsr  )meanstdNg      ?)r:  r	   r   r5   r[   normal_r]   initializer_ranger8   zero_r~   rq   r   fill_r  )r   modules     r0   _init_weights!BertPreTrainedModel._init_weights  s3   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) 455KK""$ 6r   r-   N)r   r   r   r   r$   config_classrl   load_tf_weightsbase_model_prefixsupports_gradient_checkpointing_supports_sdpar  r   r-   r   r0   r  r    s"    L-O&*#N%r   r  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
BertForPreTrainingOutputi2  a  

@dataclass
class BertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`BertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    seq_relationship_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    )custom_introc                      ^  \ rS rSrSS/rSU 4S jjrS rS rS r\	             SS\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\\R                        S\
\   S\
\   S\
\   S\
\   S\\\R                     \4   4S jj5       rSrU =r$ )	BertModeliT  rn   rJ  c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        UR                  U l
        UR                  U l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)r|   r}   r]   rn   r   ra  encoderr  poolerr'  attn_implementationrt   	post_init)r   r]   add_pooling_layerr   s      r0   r}   BertModel.__init__c  sg    
 	 (0"6*,=j(4#)#>#> '-'E'E$ 	r   c                 .    U R                   R                  $ r   r   r   r  s    r0   get_input_embeddingsBertModel.get_input_embeddingsv  s    ...r   c                 $    XR                   l        g r   r  )r   r   s     r0   set_input_embeddingsBertModel.set_input_embeddingsy  s    */'r   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  rf  rN  r0  )r   heads_to_prunerf  r.  s       r0   _prune_headsBertModel._prune_heads|  s<    
 +002LELLu%//;;EB 3r   r   r   ry   rv   r   r   r   r   ri  r   r   rj  rk  r   c                 R   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U
b  U
OU R                   R
                  n
OSn
Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  nnUb  UR                  OUR                  nU	b  U	S   S   R                  S   OSnUcs  [        U R                  S5      (       a4  U R                  R                  S S 2S U24   nUR                  UU5      nUnO$[        R                   " U[        R"                  US9nU R                  UUUUUS	9nUc  [        R$                  " UUU-   4US
9nU R&                  S:H  =(       a(    U R(                  S:H  =(       a    US L =(       a    U(       + nU(       aT  UR+                  5       S:X  a@  U R                   R                  (       a  [-        UUUU5      nO'[/        UUR0                  US9nOU R3                  X.5      nU R                   R                  (       av  Ubs  UR                  5       u  nnnUU4nUc  [        R$                  " UUS
9nU(       a*  UR+                  5       S:X  a  [/        UUR0                  US9nOU R5                  U5      nOS nU R7                  XPR                   R8                  5      nU R;                  UUUUUU	U
UUUS9
nUS   nU R<                  b  U R=                  U5      OS nU(       d
  UU4USS  -   $ [?        UUUR@                  URB                  URD                  URF                  S9$ )NFzDYou cannot specify both input_ids and inputs_embeds at the same timerw   z5You have to specify either input_ids or inputs_embedsr   r<   ry   r   )r   rv   ry   r   r   )r   r"  ru   )r  )	r   r   r   r   ri  r   r   rj  rk  r#   )rp  pooler_outputri  r   rq  rr  )$r]   r   rj  use_return_dictr   r   rW   %warn_if_padding_and_no_attention_maskr   r   rV   r   r   ry   r   rY   r   r   onesr  rt   r   r   r   r{   get_extended_attention_maskinvert_attention_maskget_head_maskre  r  r  r   ri  r   rq  rr  ) r   r   r   ry   rv   r   r   r   r   ri  r   r   rj  rk  r   
batch_sizer   r   r   r   r   embedding_outputuse_sdpa_attention_masksextended_attention_maskencoder_batch_sizeencoder_sequence_lengthr  encoder_hidden_shapeencoder_extended_attention_maskencoder_outputsr  r  s                                    r0   r   BertModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!t(899*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z??%)'#9 + 
 !"ZZZBX5X(YbhiN $$. &,,
:&T!& &%	 	! $(:(:(<(A {{%%*T"$*	+' +N"$4$:$:J+' '+&F&F~&c# ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&',B,F,F,HA,M 3V*,<,B,BJ3/ 372L2LMc2d/.2+ &&y++2O2OP	,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r   )r  r]   r   r  r  rt   )T)NNNNNNNNNNNNN)r   r   r   r   _no_split_modulesr}   r  r  r  r    r   rY   r   r   r   r   r   r   r   r   r   r   r   s   @r0   r  r  T  sx    *;7&/0C  -11515/3,0048<9==A$(,0/3&*L
ELL)L
 !.L
 !.	L

 u||,L
 ELL)L
  -L
  (5L
 !) 6L
 "$u'8'8"9:L
 D>L
 $D>L
 'tnL
 d^L
 
uU\\"$PP	QL
 L
r   r  z
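

# --- Illustrative feature-extraction sketch (not part of the upstream module). Assumes the
# standard "google-bert/bert-base-uncased" checkpoint is available locally or via the Hub;
# adapt or remove as needed.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    encoder = BertModel.from_pretrained("google-bert/bert-base-uncased")
    batch = tokenizer("Hello, world!", return_tensors="pt")
    with torch.no_grad():
        out = encoder(**batch)
    # last_hidden_state is (batch, seq_len, hidden_size); pooler_output is (batch, hidden_size).
    print(out.last_hidden_state.shape, out.pooler_output.shape)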

@auto_docstring(
    custom_intro="""
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """
)
class BertForPreTraining(BertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return BertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **loss_kwargs,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@auto_docstring
class BertForMaskedLM(BertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

    @classmethod
    def can_generate(cls) -> bool:
        """
Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
`prepare_inputs_for_generation` method.
Fr-   )r   s    r0   can_generateBertForMaskedLM.can_generateF  s     r   r  )NNNNNNNNNNNNr   )r   r   r   r   r  r}   r  r  r    r   rY   r   r   r   r   r   r   rF  classmethodrI  r   r   r   s   @r0   r6  r6    sj   46VW,8  -11515/3,0048<9=)-,0/3&*7
ELL)7
 !.7
 !.	7

 u||,7
 ELL)7
  -7
  (57
 !) 67
 &7
 $D>7
 'tn7
 d^7
 
uU\\"N2	37
 7
rJ  T  r   r6  zT
@auto_docstring(
    custom_intro="""
    Bert Model with a `next sentence prediction (classification)` head on top.
    """
)
class BertForNextSentencePrediction(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        """
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
                " `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        seq_relationship_scores = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))

        if not return_dict:
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once from label dtype/shape if it is not configured.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class BertForMultipleChoice(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Flatten the choice dimension so each (example, choice) pair runs through BERT as one sequence.
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class BertForTokenClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class BertForQuestionAnswering(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "BertForMaskedLM",
    "BertForMultipleChoice",
    "BertForNextSentencePrediction",
    "BertForPreTraining",
    "BertForQuestionAnswering",
    "BertForSequenceClassification",
    "BertForTokenClassification",
    "BertLayer",
    "BertLMHeadModel",
    "BertModel",
    "BertPreTrainedModel",
    "load_tf_weights_in_bert",
]