
    fTh                     F   S r SSKrSSKJrJrJr  SSKrSSKrSSKJr  SSK	J
r
JrJrJr  SSK	Jr  SSKJr  SS	KJr  SS
KJrJrJrJrJr  SSKJr  SSKJrJr  SSKJ r   \RB                  " \"5      r#S&S jr$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\RJ                  5      r(\ " S S\5      5       r)\ " S S\)5      5       r*\" SS9 " S S\)\5      5       r+\" SS9 " S S \)5      5       r,\ " S! S"\)5      5       r-\ " S# S$\)5      5       r./ S%Qr/g)'zPyTorch MPT model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )GenerationMixin)!_prepare_4d_causal_attention_mask))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )	MptConfigc                 H   [         R                  " SU-
  S[         R                  US9R                  SSSU5      nS[        R
                  " [        R                  " U 5      5      -  n[         R                  " SUS-   [         R                  US9R                  5       nXbU-  -  nS[         R                  " SU5      -  nUR                  SUSS5      nXP:w  a7  [         R                  " USS2SSS2S4   USS2SSS2S4   /SS9SS2SU 2S4   nXG-  nUR                  S5      $ )	a  
Link to paper: https://arxiv.org/abs/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
r   )dtypedevice         ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopess           \/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensorr4   +   s    LL_,au{{6RWWXY[\^_apqE		$))I*> ??<</!35;;vV\\^D$889D599Q%%F[[0!Q7F(vaAsl3VAssCK5HIqQRSU_V_U_adRdeNE==    c            
          ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\R                  S\	\
\R                        S\	\R                     4S	 jjrS
rU =r$ )MptAttentionB   zrMulti-head self attention.
Using torch or triton attention implementation enables user to also use additive bias.
configc                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        U R                  U R                  -  U l        UR                  R                  U l        U R                  c5  S[        R                  " U R                  U R                  -  5      -  U l        UR                  R                  U l        UR                  R                  U l        [        R                  " U R                  SU R                  -  SS9U l        [        R                  " U R                  U R                  SS9U l        g )Nr   r   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler$   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projselfr9   	__class__s     r3   r>   MptAttention.__init__G   s    !--~~$00((DLL8#//==%!"TYYt/?/?$,,/N%O!OD$00;;**33IId..D4D4D0D5Q			$"2"2D4D4D5Qr5   hidden_statesposition_biaspast_key_valueattention_maskc                    UR                   S S u  pVU R                  U5      nU R                  (       a%  UR                  U R                  * U R                  S9nUR	                  SSS9u  pn
UR                  XVU R                  U R                  5      R                  SS5      nU	R                  XVU R                  U R                  5      R                  SS5      n	U
R                  XVU R                  U R                  5      R                  SS5      n
UbG  [        U5      S:w  a4  [        R                  " US   U	/SS9n	[        R                  " US   U
/SS9n
X4nOX4n[        R                  " XR                  SS5      5      U R                  -  nUc  UOXcS   R                   S   -   nUb  [        UR                   5      S:w  a!  [        S	[        UR                   5       35      eU	R                   S   n[        SUR!                  S5      U-
  5      n[        SUR!                  S5      U-
  5      nUS S 2US 2US 24   nX-   nUb:  UR#                  U[        R$                  " UR&                  5      R(                  5      n[*        R,                  R/                  UR1                  5       SS9R3                  U
R&                  5      n[*        R,                  R5                  UU R6                  U R8                  S
9n[        R                  " UU
5      nUR;                  SSSS5      R=                  5       R?                  XVS5      nU RA                  U5      nUUU4$ )Nr   )minmaxr   r   r   r   z6Expecting position_bias shape to be 3 dimensions, got ptraining)!shaperK   rI   clampchunkreshaper@   rC   	transposelenr    catmatmulrE   
ValueErrorrW   sizemasked_fillfinfor   rV   r   r   softmaxr(   todropoutrH   r\   permute
contiguousr#   rL   )rN   rQ   rR   rS   rT   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statesattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputs                      r3   forwardMptAttention.forwardV   s    "/!4!4Ra!8
IIm,	==!T]]NNI1:1J.,#++JDLLRVR_R_`jjklnop''
dmm\ffghjkl
#++JDLLRVR_R_`jjklnop%>"a'"YYq(9:'FAN
$yy.*;\)JPQR(7N(7N <<6J6J2r6RSVZVhVhh%3%;z]^N_NeNefgNhAh$=&&'1, #YZ]^k^q^qZrYs!tuu#))"-J(+A}/A/A!/D|/S(T%&)!]-?-?-BZ-O&P#)!-F-GI`Ia*abM/?%/;;NEKKXdXjXjLkLoLop }},,-=-C-C-E2,NQQR^RdRde}},,\T=P=P[_[h[h,ilLA'//1a;FFHMMjfhimmN3L.88r5   )	rK   rH   rI   rC   r?   rB   r@   rL   rE   )NN)__name__
__module____qualname____firstlineno____doc__r   r>   r    Tensorr   r   r|   __static_attributes____classcell__rO   s   @r3   r7   r7   B   sm    Ry R& 9=1559||59 ||59 !u||!45	59
 !.59 59r5   r7   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )	MptMLP   r9   c                   > [         TU ]  5         UR                  n[        R                  " USU-  SS9U l        [        R                  " SS9U l        [        R                  " SU-  USS9U l        UR                  R                  U l        g )N   Fr;   none)approximate)r=   r>   r?   r   rJ   up_projGELUact	down_projrD   rG   hidden_dropoutrN   r9   r?   rO   s      r3   r>   MptMLP.__init__   sm    ((yya+oEJ77v.1{?KeL$00;;r5   rQ   residualreturnc                     U R                  U R                  U5      5      nU R                  U5      n[        R                  " X0R
                  U R                  S9nXB-   nU$ )NrZ   )r   r   r   Frk   r   r\   )rN   rQ   r   intermediate_outputoutputs        r3   r|   MptMLP.forward   sS    m!<="nn];.2E2EPTP]P]^"r5   )r   r   r   r   )r~   r   r   r   r   r>   r    r   r|   r   r   r   s   @r3   r   r      s:    <y <U\\ U\\ ell  r5   r   c                      ^  \ rS rSrS\4U 4S jjr   SS\R                  S\R                  S\R                  S\\	\R                  \R                  4      S\
S	\
4S
 jjrSrU =r$ )MptBlock   r9   c                   > [         TU ]  5         UR                  n[        X!R                  S9U l        S U R
                  l        UR                  U l        [        U5      U l
        [        X!R                  S9U l        S U R                  l        [        U5      U l        UR                  R                  U l        ["        R$                  " U R                   5      U l        g )Neps)r=   r>   r?   r	   layer_norm_epsilonnorm_1r<   r@   r,   r7   attnnorm_2r   ffnrD   rG   dropout_rater   Dropoutresid_attn_dropoutr   s      r3   r>   MptBlock.__init__   s    ((1J1JK (	1J1JK&>"..99"$**T->->"?r5   rQ   rR   rT   
layer_past	use_cacheoutput_attentionsc                     U R                  U5      nUnU R                  UUUUS9u  pnU R                  U	5      U-   nU R                  U5      nUnU R	                  Xx5      nU4nU(       a  X4-  nU(       a  X4-  nU$ )N)rR   rT   rS   )r   r   r   r   r   )rN   rQ   rR   rT   r   r   r   layernorm_outputr   attn_outputsry   rS   r   outputss                 r3   r|   MptBlock.forward   s      ;;}5  6:YY')%	 6? 6
2N //=H;;}5 ! *5)((G&Gr5   )r   r   r   r   r   r,   r   )NFF)r~   r   r   r   r   r>   r    r   r   r   boolr|   r   r   r   s   @r3   r   r      s    @y @2 CG"'(||( ||( 	(
 U5<<#=>?( (  ( (r5   r   c                      ^  \ rS rSr\rSrSrS/rS/r	U 4S jr
S\R                  4S jr\S	\\\R"                  \R"                  4      S
\\\R"                  \R"                  4      4S j5       rSrU =r$ )MptPreTrainedModel   transformerTr   z
lm_head.*.c                 &   > [         TU ]  " U0 UD6  g N)r=   r>   )rN   inputskwargsrO   s      r3   r>   MptPreTrainedModel.__init__   s    &+F+r5   modulec                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        5      (       aW  UR                  b$  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weights.g        )meanstdNr   )
isinstancer   rJ   weightdatanormal_r9   initializer_ranger<   zero_	Embeddingpadding_idxr	   fill_)rN   r   s     r3   _init_weights MptPreTrainedModel._init_weights   s   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .	**{{&  &&(MM$$S) +r5   rS   r   c                 j   ^^^ U S   S   R                   u  pmmX-  m[        UUU4S jU  5       5      $ )zg
Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
r   c              3   |   >#    U  H1  nUS    R                  TTT5      US   R                  TTT5      4v   M3     g7fr   r   N)r`   ).0r   batch_size_times_num_headsrC   ro   s     r3   	<genexpr>;MptPreTrainedModel._convert_to_mpt_cache.<locals>.<genexpr>
  sL      

 -
 1%%&@(JW1%%&@*hW -s   9<)r]   tuple)rS   rn   r,   r   rC   ro   s      @@@r3   _convert_to_mpt_cache(MptPreTrainedModel._convert_to_mpt_cache   sI     7EQ6G6J6P6P3
x%/%;"  

 -
 
 	
r5    )r~   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_keys_to_ignore_on_load_missingr>   r   Moduler   staticmethodr   r    r   r   r   r   r   s   @r3   r   r      s    L%&*##'4o#,*BII *" 
eELL%,,$>?@
	uU\\5<</0	1
 
r5   r   c                   ~  ^  \ rS rSrS\4U 4S jjrS rSS jrS\R                  4S jr
\        SS\\R                     S	\\\\R                  \R                  4   S
4      S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\R                  S
4   \4   4S jj5       rSrU =r$ )MptModeli  r9   c                   > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        U R                  UR                  S9U l        S U R                   l        SU l        U R'                  5         g s  snf )Nr   F)r=   r>   r?   r@   r,   r   r   
vocab_sizewte
ModuleListrangen_layersr   blocksr	   r   norm_fr<   gradient_checkpointing	post_init)rN   r9   _rO   s      r3   r>   MptModel.__init__  s     !-- << 1 143C3CD mmuV__?U$V?U!Xf%5?U$VW   0 0f6O6OP&+# 	 %Ws   
C7c                     U R                   $ r   r   rN   s    r3   get_input_embeddingsMptModel.get_input_embeddings+  s    xxr5   c                     [        XX45      $ r   )r4   )rN   r,   r-   r.   r   s        r3   r4   MptModel.build_mpt_alibi_tensor.  s    %i.YYr5   new_embeddingsc                     Xl         g r   r   rN   r   s     r3   set_input_embeddingsMptModel.set_input_embeddings1  s    !r5   	input_idspast_key_values.rT   inputs_embedsr   r   output_hidden_statesreturn_dictr   c	           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb  UR                  u  pOUb  UR                  u  pnO[        S5      eUc"  [        S/[        U R                  5      -  5      nUc  U R                  U5      nUnU(       a  SOSnU(       a  SOSnU(       a  SOSnU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUnSnUS   b  US   S   R                  S   nUU-   nUc"  [        R                   " U
U4UR"                  S	9nOUR%                  UR"                  5      nU R'                  U R(                  U R                   R*                  UR"                  S	9n[-        X:U4UU5      nUR/                  5       n[1        U R                  U5       H  u  nnU(       a  UU4-   nU R                  (       a3  U R                  (       a"  U R3                  UR4                  UUUUUU5      nOU" UUUUUUS
9nUS   nUSL a	  UUS   4-   nU(       d  M~  UUU(       a  SOS   4-   nM     U R7                  U5      nU(       a  UU4-   nU(       d  [        S XUU4 5       5      $ [9        UUUUS9$ )h  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   )r   rT   r   r   rR   Tr   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   )r   vs     r3   r   #MptModel.forward.<locals>.<genexpr>  s     w$eq$es   	)last_hidden_stater   rQ   
attentions)r9   r   r   r   use_return_dictre   r]   r   rb   r   r   r   r\   loggerwarning_oncer    onesr   rj   r4   r,   rA   r   r   zip_gradient_checkpointing_func__call__r   r   )rN   r   r   rT   r   r   r   r   r   r   rn   ro   r   rQ   presentsall_self_attentionsall_hidden_statesseq_length_with_pastpast_key_values_lengthr/   causal_maskblockr   r   s                           r3   r|   MptModel.forward4  s   4 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"%.__"J
&(5(;(;%JATUU"#TFS-=$=>O  HHY/M%"2$5b4"6BD&&4==##p "	  *!"1)%4Q%7%:%@%@%C"#7:P#P !"ZZ5I(JS`SgSghN+..}/C/CDN++DNNDKK<S<S\i\p\p+q74mE[
 "&&(!$T[[/!BE:#$58H$H!**t}};;NN!%  !)#.'&7"' $AJMD #wqzm3  &9W)QYZ=[<]&]#; "C@ M2 1]4D Dw]>OQd$ewww8+$+*	
 	
r5   )r   r   r?   r   r,   r      NNNNNNNNN)r~   r   r   r   r   r>   r   r4   r    r   r   r   r   
LongTensorr   r   r   r   r|   r   r   r   s   @r3   r   r     s$   y ,Z"5<< "  15SW1548$(,0/3&*{
E,,-{
 "%ellELL.H(I3(N"OP{
 !.	{

   0 01{
 D>{
 $D>{
 'tn{
 d^{
 
uU\\3&')RR	S{
 {
r5   r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                   8  ^  \ rS rSrS/rS\4U 4S jjrS rS\R                  4S jr
\         SS\\R                     S	\\\\R                  \R                  4   S
4      S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rS\\\R                  \R                  4   S
4   S\R                  S\\\R                  \R                  4   S
4   4S jrSrU =r$ )MptForCausalLMi  zlm_head.weightr9   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFr;   )
r=   r>   r   r   r   rJ   r?   r   lm_headr   rM   s     r3   r>   MptForCausalLM.__init__  sI     #F+yy!3!3V5F5FUS 	r5   c                     U R                   $ r   r  r   s    r3   get_output_embeddings$MptForCausalLM.get_output_embeddings  s    ||r5   r   c                     Xl         g r   r   r   s     r3   set_output_embeddings$MptForCausalLM.set_output_embeddings  s    %r5   r   r   .rT   r   labelsr   r   r   r   r   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nSnUbE  UR	                  UR
                  5      nU R                  " UU4SU R                   R                  0U
D6nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
Nr   rT   r   r   r   r   r   r   r   r   losslogitsr   rQ   r  )r9   r  r   r  rj   r   loss_functionr   r   r   rQ   r  )rN   r   r   rT   r   r&  r   r   r   r   r   transformer_outputsrQ   	lm_logitsr*  r   s                   r3   r|   MptForCausalLM.forward  s   > &1%<k$++B]B]"..+)'/!5# / 	
 ,A.LL/	YYy//0F%%  ;;11 	D \$7$;;F)-)9TGf$EvE0/??-;;*55
 	
r5   pastbeam_idxc           	         ^ U VVs0 s H1  o3  H(  oDR                   UR                  UR                   5      _M*     M3     snnm[        U4S jU 5       5      nU$ s  snnf )a$  
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.

Output shares the same memory storage as `past`.
c              3      >#    U  HO  nUS    R                  S TUS    R                     5      US   R                  S TUS    R                     5      4v   MQ     g7fr   )index_selectr   )r   r   device_to_beam_idxs     r3   r   0MptForCausalLM._reorder_cache.<locals>.<genexpr>  sf      

 #
 1**1.@AAUAU.VW1**1.@AAUAU.VW #s   AA)r   rj   r   )rN   r0  r1  r   
past_statereordered_pastr5  s         @r3   _reorder_cacheMptForCausalLM._reorder_cache  sn     QU
PT*gqYcx{{:+<+<==gqPT
  

 #
 
 
s   8A)r  r   	NNNNNNNNN)r~   r   r   r   _tied_weights_keysr   r>   r!  r    r   r$  r   r   r  r   r   r   r   r|   r9  r   r   r   s   @r3   r  r    s    ++y &ELL &  15SW1504)-$(,0/3&*D
E,,-D
 "%ellELL.H(I3(N"OPD
 !.	D

  -D
 &D
 D>D
 $D>D
 'tnD
 d^D
 
uU\\"$EE	FD
 D
L%ell :;S@AMRM]M]	uU\\5<</0#5	6 r5   r  a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   h  ^  \ rS rSrS\4U 4S jjr\         SS\\R                     S\\
\
\R                  \R                  4   S4      S\\R                     S\\R                     S	\\R                     S
\\   S\\   S\\   S\\   S\\
\R                     \4   4S jj5       rSrU =r$ )MptForSequenceClassificationi)  r9   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r  )
r=   r>   
num_labelsr   r   r   rJ   r?   scorer   rM   s     r3   r>   %MptForSequenceClassification.__init__8  sV      ++#F+YYv1163D3D5Q
 	r5   r   r   .rT   r   r&  r   r   r   r   r   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9n
U
S   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUGbg  U R                   R"                  c  U R$                  S:X  a  S
U R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOeU" UU5      nO[U R                   R"                  S:X  a  [1        5       nU" UU5      nO-U R                   R"                  S:X  a  [3        5       nU" UU5      nU	(       d  U4U
SS -   nUb  U4U-   $ U$ [5        UUU
R6                  U
R8                  U
R:                  S9$ )  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr(  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rX   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr)  )r9   r  r   rA  r]   pad_token_idre   rj   r   r    r"   r!   argmaxr  r  rO   r~   problem_typer@  r   longintr
   r+   r   r   r   r   rQ   r  )rN   r   r   rT   r   r&  r   r   r   r   r-  rQ   r+  rn   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr*  loss_fctr   s                        r3   r|   $MptForSequenceClassification.forwardA  s   < &1%<k$++B]B]"..+)'/!5# / 	
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r5   )r@  rA  r   r;  )r~   r   r   r   r   r>   r   r   r    r  r   r   r   r   r   r|   r   r   r   s   @r3   r>  r>  )  s   y   15SW1504)-$(,0/3&*d
E,,-d
 "%ellELL.H(I3(N"OPd
 !.	d

  -d
 &d
 D>d
 $D>d
 'tnd
 d^d
 
uU\\"$DD	Ed
 d
r5   r>  c                   h  ^  \ rS rSrS\4U 4S jjr\         SS\\R                     S\\
\
\R                  \R                  4   S4      S\\R                     S\\R                     S	\\R                     S
\\   S\\   S\\   S\\   S\\
\R                     \4   4S jj5       rSrU =r$ )MptForTokenClassificationi  r9   c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        US5      (       a  UR                  b  UR                  nO-[        US5      (       a  UR                  b  UR                  nOSn[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropoutr   g?)r=   r>   r@  r   r   hasattrrV  r   r   r   rk   rJ   r?   
classifierr   )rN   r9   rV  rO   s      r3   r>   "MptForTokenClassification.__init__  s      ++#F+6/00V5N5N5Z!'!:!:V-..63H3H3T!'!6!6!$zz"45))F$6$68I8IJ 	r5   r   r   .rT   r   r&  r   r   r   r   r   c
                 
   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUbl  UR                  UR                  5      nUR                  u  nn[        5       nU" UR                  UU-  U R                  5      UR                  UU-  5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )rD  Nr(  r   r   )r*  r+  rQ   r  )r9   r  r   rk   rX  rj   r   r]   r   r#   r@  r   rQ   r  )rN   r   r   rT   r   r&  r   r   r   r   deprecated_argumentsr-  rQ   r+  r*  rn   ro   rQ  r   s                      r3   r|   !MptForTokenClassification.forward  s+   > &1%<k$++B]B]"..+)'/!5# / 	
 ,A.]3/YYv}}-F%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r5   )rX  rk   r@  r   r;  )r~   r   r   r   r   r>   r   r   r    r  r   r   r   r   r   r|   r   r   r   s   @r3   rT  rT    s   y "  15SW1504)-$(,0/3&*B
E,,-B
 "%ellELL.H(I3(N"OPB
 !.	B

  -B
 &B
 D>B
 $D>B
 'tnB
 d^B
 
uU\\"$99	:B
 B
r5   rT  c                     ^  \ rS rSrU 4S jr\        SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\
   S	\\
   S
\\
   S\\\4   4S jj5       rSrU =r$ )MptForQuestionAnsweringi  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  S5      U l        U R                  5         g )Nr   )	r=   r>   r   r   r   rJ   r?   
qa_outputsr   rM   s     r3   r>    MptForQuestionAnswering.__init__  sA     #F+))F$6$6: 	r5   r   rT   r   start_positionsend_positionsr   r   r   r   c	           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n	U	S   n
U R                  U
5      nUR	                  SSS9u  pUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU(       d  X4U	SS -   nUb  U4U-   $ U$ [        UUUU	R                  U	R                  S	9$ )
r   N)rT   r   r   r   r   r   r   rX   r   )ignore_indexr   )r*  start_logits
end_logitsrQ   r  )r9   r  r   r`  splitr+   rm   rb   rf   r^   r   r   rQ   r  )rN   r   rT   r   rb  rc  r   r   r   r   sequence_outputr+  rf  rg  
total_lossignored_indexrQ  
start_lossend_lossr   s                       r3   r|   MptForQuestionAnswering.forward  s   2 &1%<k$++B]B]"")'/!5# # 
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r5   )r`  r   r  )r~   r   r   r   r>   r   r   r    r  FloatTensorr   r   r   r   r|   r   r   r   s   @r3   r^  r^    s      156:596:48,0/3&*E
E,,-E
 !!2!23E
   1 12	E

 "%"2"23E
   0 01E
 $D>E
 'tnE
 d^E
 
u22	3E
 E
r5   r^  )r  r   r   r>  rT  r^  r  )0r   r$   typingr   r   r   r    torch.utils.checkpointr   torch.nnr   r   r	   r
   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_mptr   
get_loggerr~   r  r4   r   r7   r   r   r   r   r  r>  rT  r^  __all__r   r5   r3   <module>r{     sd     ) )    L L $ ) I  . , ( 
		H	%.I9299 I9XRYY *=ryy =@ ,
 ,
 ,
^ \
! \
 \
~ m' mm` o
#5 o
o
d U
 2 U
 U
p O
0 O
 O
dr5   