
"""PyTorch OpenAI GPT-2 model."""

import math
import os
import warnings
from dataclasses import dataclass
from typing import Callable, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, get_activation
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_attention_mask_for_sdpa
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
from ...utils import ModelOutput, add_start_docstrings, auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from ...utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_gpt2 import GPT2Config


logger = logging.get_logger(__name__)


def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """Load tf checkpoints in a pytorch model"""
    try:
        import re

        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(gpt2_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array.squeeze())

    for name, array in zip(names, arrays):
        name = name[6:]  # skip "model/"
        name = name.split("/")
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "w" or scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model
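# A minimal sketch of how the converter above is typically driven (the checkpoint path below is
# hypothetical and only for illustration; any TF1-style GPT-2 checkpoint works the same way):
#
#     from transformers import GPT2Config, GPT2LMHeadModel
#
#     config = GPT2Config()
#     model = GPT2LMHeadModel(config)
#     model = load_tf_weights_in_gpt2(model, config, "/path/to/gpt2/model.ckpt")
#     model.save_pretrained("./gpt2-converted")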
J-,JJc                 0   [         R                  " XR                  SS5      5      nU R                  (       a@  U[         R                  " / UR                  S5      S-  UR                  UR                  S9-  nU R                  (       a  U[        U R                  S-   5      -  nU R                  (       d  UR                  S5      UR                  S5      pU R                  S S 2S S 2X-
  U	2S U	24   n
[         R                  " UR                  5      R                  n[         R                  " / XR                  UR                  S9n[         R                  " XR!                  UR                  5      U5      nUb"  US S 2S S 2S S 2S UR"                  S   24   n
Xz-   n[$        R&                  R)                  USS9nUR+                  UR                  5      nU R-                  U5      nUb  Xu-  n[         R                  " Xs5      nUR                  SS5      nX4$ )N      ?dtypedevicer'   dimr3   )rK   matmul	transposescale_attn_weightsfullsizerd   re   scale_attn_by_inverse_layer_idxfloat	layer_idxis_cross_attentionr0   finfominwheretorH   r   
functionalsoftmaxtypeattn_dropout)modulequerykeyvalueattention_mask	head_maskkwargsattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs                r]   eager_attention_forwardr   p   s   <<}}R'<=L  #ejj

2#%\-?-?H[H['
 

 --#eF,<,<q,@&AA$$#(::b>388B<jkk!Q
(AJ(NP[Q[P["[\[[!3!3488
 ZZJ6H6HQ]QdQde
{{;@R@R0SU_`!$Q1o		"o%=>#1==((2(>L  $$U[[1L&&|4L #/,,|3K''1-K$$    c                     ^  \ rS rSrSU 4S jjrS rSS jr\" SSSSS	9       SS
\\	\
R                        S\\   S\\
R                     S\\
R                     S\\
R                     S\\
R                     S\\
R                     S\\   S\	\\
R                  \	\
R                     4   S4   4S jj5       rSrU =r$ )GPT2Attention   c           
        > [         TU ]  5         Xl        UR                  nU R	                  S[
        R                  " [
        R                  " XD4[
        R                  S95      R                  SSXD5      SS9  U R	                  S[
        R                  " S5      SS9  UR                  U l        UR                  U l        U R                  U R                  -  U l        U R                  U l        U R                  U R                  -  U R                  :w  a&  [#        SU R                   S	U R                   S
35      eUR$                  U l        X l        UR(                  U l        X0l        UR,                  U l        U R&                  (       aN  [/        SU R                  -  U R                  5      U l        [/        U R                  U R                  5      U l        O([/        SU R                  -  U R                  5      U l        [/        U R                  U R                  5      U l        [6        R8                  " UR:                  5      U l        [6        R8                  " UR>                  5      U l         SU l!        [E        5       U l#        g )Nr0   rd   r'   F)
persistentmasked_biasg     z=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r3   r   T)$super__init__rO   max_position_embeddingsregister_bufferrK   trilonesboolviewtensorhidden_size	embed_dimnum_attention_heads	num_headshead_dim
split_sizerI   rj   rp   rm   ro   reorder_and_upcast_attnr   c_attnq_attnc_projr   Dropout
attn_pdroprx   resid_pdropresid_dropout	is_causalsetpruned_heads)selfrO   rp   ro   max_positions	__class__s        r]   r   GPT2Attention.__init__   s   66JJuzz="@

STYY1m  	 	
 	]ELL,>5Q++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;Er   c                 8   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[
        R                  " X"U R                  -   USU R                  -  -   /5      n[        U R                  USS9U l	        [        U R                  USS9U l
        U R                  U R                  -  U R                  [        U5      -
  -  U l        U R                  [        U5      -
  U l        U R                  R                  U5      U l        g )Nr   r3   r'   rf   )rF   r   r   r   r   rK   catr   r   r   r   union)r   headsindex
index_attns       r]   prune_headsGPT2Attention.prune_heads   s    u:?7~~t}}^b^o^opYYt'>T__I\@]^_
 )jaH(eC  ??dnn<RUV[R\A\]#e*4 --33E:r   c           	         UR                  5       u  pgpUR                  5       u    pn
[        R                  " Xg-  X[        R                  UR                  S9nSnU R
                  (       a   U[        UR                  S5      5      S-  -  nU R                  (       a  U[        U R                  S-   5      -  n[        R                  R                  UR                  R                  SS9   UR                  SX5      UR                  SS5      R                  SX5      p[        R                  " XR                  5       UR                  5       S	US
9nUR                  XgX5      nS S S 5        U R                  (       d  UR                  S5      UR                  S5      nnU R                   S S 2S S 2UU-
  U2S U24   n[        R"                  " UR$                  5      R&                  n[        R(                  " UUR$                  UR                  S9n[        R*                  " UUU5      nUb  X-   n[,        R.                  R1                  USS9nUR$                  [        R                  :w  a  [3        S5      eUR                  UR$                  5      nU R5                  U5      nUb  X-  n[        R6                  " X5      nUR                  SS5      nUU4$ ! , (       d  f       GNy= f)Nrc         ?r`   rb   r'   F)enabledra   r   )betaalpharf   zDError with upcasting, attn_weights does not have dtype torch.float32r3   )rl   rK   emptyfloat32re   rj   rn   rm   ro   ampautocastrw   reshaperi   baddbmmrp   r0   rq   rd   rr   r   rs   r   ru   rv   RuntimeErrorrx   rh   )r   rz   r{   r|   r}   r~   bszr   	q_seq_lendk_	k_seq_lenr   scale_factorqkr   r   r   r   r   s                        r]   _upcast_and_reordered_attn(GPT2Attention._upcast_and_reordered_attn   sj   (-

%	 XXZ1 {{3?IPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L YY 1 15A==Y3S]]2r5J5R5RSUWY5eq ==wwy!'')RS[ghL'//	UL B
 &&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'8L}},,\r,B .eff#((5((6  '3Lll<7!++Aq1L((E BAs   *A9K
K$
layer_pastpast_key_value4.53.0Tnew_nameversionraise_if_both_nameshidden_statescache_positionr}   r~   encoder_hidden_statesencoder_attention_maskoutput_attentionsreturn.c	                    US Ln
U
(       a[  [        U S5      (       d  [        S5      eU R                  U5      nU R                  U5      R	                  U R
                  SS9u  pUnO,U R                  U5      R	                  U R
                  SS9u  pn/ UR                  S S QSPU R                  P7n/ UR                  S S QSPU R                  P7nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      nUbV  [        U[        5      (       a   U
(       a  UR                  nOUR                  nSU0nUR                  XU R                  US9u  pUS L =(       a     UR                  S	   S:  =(       a    U
(       + nU R                   R"                  S
:H  n[$        nU R                   R"                  S
:w  aY  U R                   R"                  S:X  a"  U(       d  Ub  Sn[&        R)                  S5        O[*        U R                   R"                     nU(       a(  U R,                  (       a  U R/                  XXU5      u  nnO;U" U UUUU4UU R0                  (       a  U R2                  R4                  OSUS.U	D6u  nnUR6                  " / UR                  S S	 QSP76 R9                  5       nU R;                  U5      nU R=                  U5      nUU4$ )Nr   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`.r3   rf   r`   r'   r   )cache_kwargsra   eagersdpaTz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r~   dropoutr   )hasattrrI   r   r   rC   r   rH   r   r   ri   
isinstancer   cross_attention_cacheself_attention_cacheupdatero   rO   _attn_implementationr   r7   warning_oncer   r   r   trainingrx   pr   
contiguousr   r   )r   r   r   r   r}   r~   r   r   r   r   rp   query_states
key_statesvalue_statesshape_qshape_kvr   r   using_eagerattention_interfacer   r   s                         r]   forwardGPT2Attention.forward  s    3$>4** p 
  ;;}5L'+{{3H'I'O'OPTP_P_ef'O'g$J3N59[[5O5U5UVZVeVekl5U5m2Ll?L&&s+?R??>Z%%cr*>B>>#((1;;AqA__X.88A>
#((2<<QB%.*=>>%%3%I%IN%3%H%HN,n=L'5'<'<$..| (= ($J #d*d|/A/A"/E/IdRdNd	kk66'A(?;;++w6{{//69?PT]Ti"##L '>dkk>^>^&_#477(,(G(G,	)%K )<
) $/3}}))++##
) 
)%K "))F;+<+<Sb+AF2FQQSkk+.((5L((r   )rx   r   r   rO   r   r   r   rp   ro   r   r   r   r   r   rm   rj   r   )FN)NN)NNNNNNF)__name__
__module____qualname____firstlineno__r   r   r   r$   r   r   rK   FloatTensorr   
LongTensorTensorr   r   r   __static_attributes____classcell__r   s   @r]   r   r      s   *"X;3)j \,<hdhi +/596:158<>B,1R)e&7&7 89R) !R) !!1!12	R)
 !!2!23R) E--.R)  (5R) !)):): ;R) $D>R) 
uU\\5#667<	=R) jR)r   r   c                   n   ^  \ rS rSrU 4S jrS\\\R                        S\R                  4S jr	Sr
U =r$ )GPT2MLPib  c                    > [         TU ]  5         UR                  n[        X5      U l        [        X15      U l        [        UR                     U l        [        R                  " UR                  5      U l        g N)r   r   r   r   c_fcr   r   activation_functionactr   r   r   r   )r   intermediate_sizerO   r   r   s       r]   r   GPT2MLP.__init__c  sZ    &&	,8	Y:&445zz&"4"45r   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r  r   r   )r   r   s     r]   r   GPT2MLP.forwardk  s@    		-0/M2]3r   )r  r   r   r   )r   r   r   r   r   r   r   rK   r   r   r   r   r   s   @r]   r   r   b  s6    6XeE4E4E.F%G EL]L]  r   r   c                     ^  \ rS rSrSU 4S jjr\" SSSSS9        SS\\\R                        S\\
   S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\\R                     \\\R                  \\R                  S4   4      4   4S jj5       rSrU =r$ )	GPT2Blockis  c                   > [         TU ]  5         UR                  nUR                  b  UR                  OSU-  n[        R
                  " X1R                  S9U l        [        XS9U l	        [        R
                  " X1R                  S9U l
        UR                  (       a3  [        USUS9U l        [        R
                  " X1R                  S9U l        [        XA5      U l        g )N   eps)rO   ro   T)rO   rp   ro   )r   r   r   n_innerr   	LayerNormlayer_norm_epsilonln_1r   attnln_2add_cross_attentioncrossattentionln_cross_attnr   mlp)r   rO   ro   r   	inner_dimr   s        r]   r   GPT2Block.__init__t  s    ((&,nn&@FNNa+o	LL2K2KL	!E	LL2K2KL	%%"/vRVbk"lD!#k?X?X!YD9-r   r   r   r   Tr   r   r   r}   r~   r   r   	use_cacher   r   .c
           
         UnU R                  U5      nU R                  " U4UUUUUU	S.U
D6u  pX-   nUbN  [        U S5      (       d  [        SU  S35      eUnU R	                  U5      nU R                  UUUUUUU	S9u  pX-   nUnU R                  U5      nU R                  U5      nUU-   nU4nU	(       a  UU4-  nUb  UW4-  nU$ )N)r   r   r}   r~   r  r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   r}   r~   r   r   r   )r  r  r   rI   r  r  r  r  )r   r   r   r   r}   r~   r   r   r  r   r   residualr   self_attn_weightscross_attn_outputcross_attn_weightsfeed_forward_hidden_statesoutputss                     r]   r   GPT2Block.forward  s:    !		-0)-	*
)))/	*
 	*
& $. ,4!122 =dV DZ Z  %H ..}=M484G4G--#&;'="3 5H 51 %8M 		-0%)XXm%<" #== ")++G$0.00r   )r  r  r  r  r  r  r   )NNNNNNFF)r   r   r   r   r   r$   r   r   rK   r   r   r   r   r   r   r   r   r   r   s   @r]   r  r  s  s.   . \,<hdhi +/596:158<>B$),1=e&7&7 89= != !!1!12	=
 !!2!23= E--.=  (5= !)):): ;= D>= $D>= 
uU\\"HU5<<uGXGXZ]G]A^3^-_$``	a= j=r   r  c                      ^  \ rS rSrSrS\4U 4S jjr S
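# Illustrative only (not part of the library code): a tiny standalone helper mirroring the
# masking and softmax steps of `eager_attention_forward` on toy tensors, to make the
# (batch, num_heads, seq_len, head_dim) layout and the lower-triangular causal mask explicit.
def _toy_causal_attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
    # query/key/value: (batch, num_heads, seq_len, head_dim)
    scores = torch.matmul(query, key.transpose(-1, -2)) / (value.size(-1) ** 0.5)
    seq_len = query.size(-2)
    causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=query.device))
    scores = scores.masked_fill(~causal_mask, torch.finfo(scores.dtype).min)
    weights = nn.functional.softmax(scores, dim=-1)
    return torch.matmul(weights, value)  # (batch, num_heads, seq_len, head_dim)
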
class GPT2SequenceSummary(nn.Module):
    r"""
Compute a single vector summary of a sequence hidden states.

Args:
    config ([`GPT2Config`]):
        The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
        config class of your model for the default values it uses):

        - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

            - `"last"` -- Take the last token hidden state (like XLNet)
            - `"first"` -- Take the first token hidden state (like Bert)
            - `"mean"` -- Take the mean of all tokens hidden states
            - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
            - `"attn"` -- Not implemented now, use multi-head attention

        - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
        - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
          (otherwise to `config.hidden_size`).
        - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
          another string or `None` will add no activation.
        - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
        - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()

        self.summary_type = getattr(config, "summary_type", "last")
        if self.summary_type == "attn":
            raise NotImplementedError

        self.summary = nn.Identity()
        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = nn.Linear(config.hidden_size, num_classes)

        activation_string = getattr(config, "summary_activation", None)
        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()

        self.first_dropout = nn.Identity()
        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)

        self.last_dropout = nn.Identity()
        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(
        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
    ) -> torch.FloatTensor:
        """
Compute a single vector summary of a sequence hidden states.

Args:
    hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
        The hidden states of the last layer.
    cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
        Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

Returns:
    `torch.FloatTensor`: The summary of the sequence hidden states.
        """
        if self.summary_type == "last":
            output = hidden_states[:, -1]
        elif self.summary_type == "first":
            output = hidden_states[:, 0]
        elif self.summary_type == "mean":
            output = hidden_states.mean(dim=1)
        elif self.summary_type == "cls_index":
            if cls_index is None:
                cls_index = torch.full_like(
                    hidden_states[..., :1, :],
                    hidden_states.shape[-2] - 1,
                    dtype=torch.long,
                )
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dims of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
        elif self.summary_type == "attn":
            raise NotImplementedError

        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        output = self.last_dropout(output)

        return output


class GPT2PreTrainedModel(PreTrainedModel):
    config_class = GPT2Config
    load_tf_weights = load_tf_weights_in_gpt2
    base_model_prefix = "transformer"
    is_parallelizable = True
    supports_gradient_checkpointing = True
    _no_split_modules = ["GPT2Block"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_attention_backend = True
    _supports_cache_class = True
    _supports_static_cache = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, Conv1D)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Reinitialize selected weights following the GPT-2 paper scheme: scale the residual
        # projection weights at initialization by 1/sqrt(2 * n_layer).
        for name, p in module.named_parameters():
            if name == "c_proj.weight":
                p.data.normal_(mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer))


@dataclass
class GPT2DoubleHeadsModelOutput(ModelOutput):
    r"""
Base class for outputs of models predicting if two sentences are consecutive or not.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
        Multiple choice classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
        Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
    past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads,
        sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        GPT2Attentions weights after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    mc_loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    mc_logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


PARALLELIZE_DOCSTRING = r"""
    This is an experimental feature and is a subject to change at a moment's notice.

    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
    it will evenly distribute blocks across all devices.

    Args:
        device_map (`Dict[int, list]`, *optional*):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
            following number of attention modules:

                - openai-community/gpt2: 12
                - openai-community/gpt2-medium: 24
                - openai-community/gpt2-large: 36
                - openai-community/gpt2-xl: 48

    Example:

    ```python
    # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")
    device_map = {
        0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
        1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
        2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
        3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
    }
    model.parallelize(device_map)
    ```
aq  
    Moves the model to cpu from a model parallel state.

    Example:

    ```python
    # On a 4 GPU machine with openai-community/gpt2-large:
    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
    device_map = {
        0: [0, 1, 2, 3, 4, 5, 6, 7],
        1: [8, 9, 10, 11, 12, 13, 14, 15],
        2: [16, 17, 18, 19, 20, 21, 22, 23],
        3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
    }
    model.parallelize(device_map)  # Splits the model across several devices
    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
    ```
c            "         ^  \ rS rSrSrU 4S jr\" \5      S!S j5       r\" \	5      S 5       r
S rS rS r\              S"S	\\R"                     S
\\\\\R(                        \4      S\\R"                     S\\R,                     S\\R"                     S\\R"                     S\\R,                     S\\R,                     S\\R(                     S\\R,                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       rS\R(                  S\R(                  S\R(                  S
\S\4
S jr\S\R(                  S\S\S\R:                  S\R(                  S\4S j5       rS rU =r $ )#	GPT2Modeli  Fc           
        > [         TU ]  U5        UR                  U l        [        R
                  " UR                  U R                  5      U l        [        R
                  " UR                  U R                  5      U l	        [        R                  " UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        XS9PM     sn5      U l        [        R$                  " U R                  UR&                  S9U l        SU l        S U l        SU l        UR0                  U l        U R3                  5         g s  snf )N)ro   r
  F)r   r   r   r   r   rO  
vocab_sizer2   r   r1   r   
embd_pdropdrop
ModuleListrangenum_hidden_layersr  hr  r  ln_fmodel_parallel
device_mapgradient_checkpointingr   	post_init)r   rO   ir   s      r]   r   GPT2Model.__init__  s     ++<< 1 14>>B<< > >OJJv001	fNfNfHghHg1	& >HghiLLV5N5NO	 $&+#$*$?$?! 	  is   Ec                    [         R                  " S[        5        UcD  [        [	        U R
                  5      [        [        R                  R                  5       5      5      OUU l
        [        U R                  [	        U R
                  5      5        SU l        SU R                  R                  5       ;   a  SO.S[        [        U R                  R                  5       5      5      -   U l        S[        [#        U R                  R                  5       5      5      -   U l        U R&                  R)                  U R                   5      U l        U R*                  R)                  U R                   5      U l        U R                  R-                  5        HG  u  p#U H<  nS[        U5      -   nU R
                  U   R)                  U5      U R
                  U'   M>     MI     U R.                  R)                  U R$                  5      U l        g )Na6  `GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1, ...}Tcpucuda:)warningswarnFutureWarningr&   rF   rv  rt  rK   cudadevice_country  r%   rx  keysstrrr   first_devicemaxlast_devicer2   rt   r1   itemsrw  )r   ry  r   vblockcuda_devices         r]   parallelizeGPT2Model.parallelize  su    	 	
 NXM_N3tvv;ejj.E.E.G(HIeo 	 	$//3tvv;7"%*doo.B.B.D%DE'TWX[\`\k\k\p\p\rXsTtJt"ST__-A-A-C)D%EE88;;t00188;;t001OO))+DA%A. $u 0 0 =u  ,
 IILL!1!12	r   c                    [         R                  " S[        5        SU l        S U l        SU l        SU l        U R                  R                  S5      U l        U R                  R                  S5      U l	        [        [        U R                  5      5       H.  nU R                  U   R                  S5      U R                  U'   M0     U R                  R                  S5      U l        [        R                  R!                  5         g )N\Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.Fr  )r  r  r  rx  ry  r  r  r2   rt   r1   rt  rF   rv  rw  rK   r  empty_cache)r   r   s     r]   deparallelizeGPT2Model.deparallelize  s    j	
 $! 88;;u%88;;u%3tvv;'E FF5M,,U3DFF5M (IILL'	

 r   c                     U R                   $ r   r2   r   s    r]   get_input_embeddingsGPT2Model.get_input_embeddings  s    xxr   c                     Xl         g r   r  r   new_embeddingss     r]   set_input_embeddingsGPT2Model.set_input_embeddings  s    !r   c                     UR                  5        H-  u  p#U R                  U   R                  R                  U5        M/     g)zf
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
N)r  rv  r  r   )r   heads_to_prunelayerr   s       r]   _prune_headsGPT2Model._prune_heads  s5     +002LEFF5M**51 3r   	input_idsrF  r   r}   token_type_idsposition_idsr~   inputs_embedsr   r   r  r   output_hidden_statesreturn_dictr   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUbF  U R                  X5        UR                  5       nUR                  SUS   5      nUR                  S   nO1Ub#  UR                  5       SS nUR                  S   nO[        S5      eUb  UR                  OUR                  nUb  UR                  SUS   5      nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnSnU(       a  Uc  Sn[        5       nOB[!        U["        5      (       d-  Sn[        R                  S	5        [        R$                  " U5      nU R                   R&                  (       a)  [!        U[(        5      (       d  [)        U[        5       5      nUc  U R+                  U5      nUcE  Ub  UR-                  5       OSn[.        R0                  " UUUR                  S
   -   UR                  S9nUc  UR3                  S5      nU R5                  U5      nUUR7                  UR                  5      -   nUb"  UR8                  S:  a  UR                  US5      nU R;                  XHX2U5      nU R<                  S:H  =(       a    USL =(       a    USL nU R                   R&                  (       at  U	bq  U	R                  5       u  nnnUU4nU
c  [.        R>                  " UUS9n
U(       a  [A        XRB                  US   S9n
O$U R<                  S:X  d  U RE                  U
5      n
OSn
U RG                  XpR                   RH                  5      nUb  U R+                  U5      nUU-   nU RK                  U5      nSUS
S -   UR                  S5      4-   nU(       a  SOSnU(       a  U R                   R&                  (       a  SOSn U(       a  SOSn![M        U RN                  5       GH  u  n"n#U RP                  (       a  [.        RR                  RU                  UR                  5        Ub  UR7                  UR                  5      n[!        U[.        RV                  5      (       a  UR7                  UR                  5      nU(       a  U!U4-   n!U R                  (       a9  U R                  (       a(  U RY                  U#RZ                  UUUUUU"   U	U
UU5
      n$OU#" U4UUUUU"   U	U
UUS.UD6n$U$S   nU(       a-  UU$S
   4-   nU R                   R&                  (       a	  U U$S   4-   n U RP                  (       d  GMR  U R\                  R_                  5        HO  u  n%n&U"U&S   :X  d  M  S[a        U%5      -   U Rb                  :w  d  M/  UR7                  S[a        U%S
-   5      -   5      nMQ     GM     U Re                  U5      nUR                  U5      nU(       a  U!U4-   n!U(       a  UOSnU(       aE  U R                   R&                  (       a  URf                  Ri                  5       OURi                  5       nU(       d  [k        S UUU!UU 4 5       5      $ [m        UUU!UU S9$ )m  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
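
Example (an illustrative sketch of encoding a prompt and reading the hidden states; the
checkpoint name is the standard public GPT-2 checkpoint, not something specific to this file):

```python
>>> from transformers import AutoTokenizer, GPT2Model

>>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
>>> model = GPT2Model.from_pretrained("openai-community/gpt2")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
```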
        """
        # Resolves `output_attentions`/`output_hidden_states`/`use_cache`/`return_dict` from the
        # config, embeds `input_ids` (or uses `inputs_embeds`) plus learned position embeddings,
        # builds the attention-implementation-specific causal mask, runs every `GPT2Block` (with
        # optional gradient checkpointing and model parallelism), applies `ln_f`, and returns a
        # `BaseModelOutputWithPastAndCrossAttentions` (or a tuple when `return_dict=False`).
        # Raises "You cannot specify both input_ids and inputs_embeds at the same time" /
        # "You have to specify either input_ids or inputs_embeds" on inconsistent inputs.
        ...

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        # Returns `None` for flash-attention-2 (or when SDPA can safely ignore the mask), and
        # otherwise expands the 2D padding mask into the 4D causal mask produced by
        # `_prepare_4d_causal_attention_mask_with_cache_position` below.
        ...

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nr	  )
fill_valuerd   re   r'   )diagonalr  r`   r   )rg   rK   rq   rr   rk   re   triur  r   r>  clonerH   masked_fill)r}   r  r  rd   r   r  r   r   r  mask_lengthpadding_masks              r]   r  ?GPT2Model._prepare_4d_causal_attention_mask_with_cache_position"  sd   < %.*<*<*>!*C(K& # E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdd+q05@Aq,;,AV5W5c5c 6Aq!\k\12 r   )r   ry  rr  r   r  rz  rv  r  rw  rx  r1   r2   r   )NNNNNNNNNNNNNN)!r   r   r   r   !_supports_param_buffer_assignmentr   r!   PARALLELIZE_DOCSTRINGr  DEPARALLELIZE_DOCSTRINGr  r  r  r  r"   r   rK   r   r   r   r   r   r   r   r   r   r  staticmethodrG   rd   r  r   r   r   s   @r]   rn  rn    sj   (-%* /03 134 12! 3! "2  15NR596:593715598<>B$(,0/3&*Y
E,,-Y
 "%eELL.A(BE(I"JKY
 !!1!12	Y

 !!2!23Y
 !!1!12Y
 u//0Y
 E--.Y
   1 12Y
  (5Y
 !)):): ;Y
 D>Y
 $D>Y
 'tnY
 d^Y
" 
u??	@#Y
 Y
v>> ll> 	>
 >  >@ 222 2 {{	2
 2 2 2r   rn  z
    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class GPT2LMHeadModel(GPT2PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        # Deprecated (emits a FutureWarning recommending `device_map='balanced'` in
        # `from_pretrained`): parallelizes `self.transformer` and moves `lm_head` to its first device.
        ...

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        # Deprecated: moves the transformer and `lm_head` back to CPU and empties the CUDA cache.
        ...

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
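
Example (an illustrative sketch of computing the LM loss and running greedy generation with the
standard public GPT-2 checkpoint):

```python
>>> from transformers import AutoTokenizer, GPT2LMHeadModel

>>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
>>> model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> loss = outputs.loss
>>> logits = outputs.logits

>>> generated = model.generate(**inputs, max_new_tokens=10, do_sample=False)
>>> tokenizer.decode(generated[0], skip_special_tokens=True)
```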
        """
        # Runs `self.transformer`, moves the hidden states to the `lm_head` device under model
        # parallelism, projects them to vocabulary logits, computes the shifted language-modeling
        # loss with `self.loss_function(..., vocab_size=self.config.vocab_size)` when `labels`
        # are given, and returns a `CausalLMOutputWithCrossAttentions` (or a tuple when
        # `return_dict=False`).
        ...


@auto_docstring(
    custom_intro="""
        The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
    RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
    input embeddings, the classification head takes as input the input of a specified classification token index in the
    input sequence).
    """
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = GPT2SequenceSummary(config)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        # Deprecated (emits a FutureWarning): parallelizes `self.transformer` and moves `lm_head`
        # and `multiple_choice_head` to its first device.
        ...

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        # Deprecated: moves every sub-module back to CPU and empties the CUDA cache.
        ...

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        mc_token_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        mc_labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, defaults to the index of the last token of the input):
    Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
    1]`.
labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
    `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`.
mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]`
    where *num_choices* is the size of the second dimension of the input tensors (see *input_ids* above).

Example:

```python
>>> import torch
>>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel

>>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
>>> model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")

>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
>>> # Update the model embeddings with the new vocabulary size
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer))

>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

>>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
>>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_logits = outputs.logits
>>> mc_logits = outputs.mc_logits
```N)rF  r   r}   r  r  r~   r  r  r   r  r  r   r`   .r'   )rg  rh  ri  rj  rF  r   rk  )rO   r  rE  rx  rK   r  r  r  rt   r  r.   re   r  rA   r
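A possible continuation of the example above: the multiple-choice loss is an ordinary cross-entropy between `mc_logits` and a label giving the index of the correct choice (the `mc_labels` tensor below is hypothetical and not part of the original example):

```python
>>> import torch.nn.functional as F

>>> mc_labels = torch.tensor([0])  # hypothetical: choice 0 is the correct one
>>> mc_loss = F.cross_entropy(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
```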
   r   rl   r   rf  rF  r   rk  )r   r  rF  r   r}   r  r  r~   r  r  r	  r  r  r   r  r  r   r  r   r  rj  rh  loss_fctlm_lossshift_logitsshift_labelsr@  s                              r]   r   GPT2DoubleHeadsModel.forward+  s   J &1%<k$++B]B]"..+)))%'/!5# / 
 ,A. JJ!!$"2"2"?"?@),,T\\-@-@-G-GHMLL/	--m\JRRSUV	 ')Hy~~b)..2DEy~~VXGYZGYYy//0F$S#2#q[1<<>L!#qr'?557L')H|00\5F5Fr5JK\M^M^_aMbcG+.A!".EEF"!f,,3,?WJ'KVK)/??-;;*55
 	
r   beam_idxc                 .   ^ [        U4S jU  5       5      $ )z
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
c              3   N   >#    U  H  n[        U4S  jU 5       5      v   M     g7f)c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectrt   re   )r  
past_stater&  s     r]   r  @GPT2DoubleHeadsModel._reorder_cache.<locals>.<genexpr>.<genexpr>  s1     j_iQ[))!X[[9J9J-KLL_is   7:Nr  )r  r   r&  s     r]   r  6GPT2DoubleHeadsModel._reorder_cache.<locals>.<genexpr>  s'      
-
 j_ijjj-s   "%r-  )rF  r&  s    `r]   _reorder_cache#GPT2DoubleHeadsModel._reorder_cache  s      
-
 
 	
r   )ry  r  rx  r  rE  r   r  )r   r   r   r   r  r   r!   r  r  r  r  r  r  r"   r   rK   r   r   r   r   r   r   rf  r   r  r/  r   r   r   s   @r]   r  r    s-    ++ /0# 1#& 12
! 3
!&  15@D596:5937155937-104$(,0/3&*!y
E,,-y
 "%ell(;"<=y
 !!1!12	y

 !!2!23y
 !!1!12y
 u//0y
 E--.y
   1 12y
 u//0y
 ))*y
 E,,-y
 D>y
 $D>y
 'tny
  d^!y
$ 
u00	1%y
 y
v 
uU\\23
?D||
	uU\\"	#
 
r   r  a  
    The GPT2 Model transformer with a sequence classification head on top (linear layer).

    [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it needs to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (takes the last value in
    each row of the batch).
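A minimal sketch of that last-token selection, assuming an `input_ids` tensor and the configured `pad_token_id` (the function name is illustrative):

```python
# Sketch of the "last non-padding token" lookup described above.
from typing import Optional

import torch


def last_token_indices(input_ids: torch.Tensor, pad_token_id: Optional[int]) -> torch.Tensor:
    batch_size, seq_len = input_ids.shape
    if pad_token_id is None:
        # No padding token configured: fall back to the final position of every row.
        return torch.full((batch_size,), seq_len - 1, device=input_ids.device)
    non_pad_mask = (input_ids != pad_token_id).to(torch.int32)
    positions = torch.arange(seq_len, device=input_ids.device, dtype=torch.int32)
    # argmax of (position * mask) picks the right-most non-padding position in each row
    return (positions * non_pad_mask).argmax(dim=-1)
```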
    c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\	\	\R                           S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\	\4   4S jj5       rSrU =r$ )GPT2ForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        SU l	        S U l
        U R                  5         g r  )r   r   r.  rn  rE  r   r/  r  scorerx  ry  r{  r  s     r]   r   &GPT2ForSequenceClassification.__init__  sc      ++$V,YYv}}dooEJ
 $ 	r   r  rF  r}   r  r  r~   r  r	  r  r   r  r  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   nU R                  U5      nUb  UR                  SS u  nnOUR                  SS u  nnU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S	35        U[        R                  " UUR                  S
9U4   nSnUGb  U R                   R"                  c  U R$                  S:X  a  SU R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                   R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                   R"                  S:X  a  [5        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  UR<                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
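The regression-vs-classification switch can be summarised with a small sketch (function and argument names are illustrative; the model additionally supports a multi-label variant selected through `config.problem_type`):

```python
# Sketch: regression loss when num_labels == 1, otherwise cross-entropy over num_labels classes.
import torch
from torch import nn


def sequence_classification_loss(pooled_logits: torch.Tensor, labels: torch.Tensor, num_labels: int) -> torch.Tensor:
    if num_labels == 1:
        return nn.MSELoss()(pooled_logits.squeeze(), labels.squeeze().float())  # regression
    return nn.CrossEntropyLoss()(pooled_logits.view(-1, num_labels), labels.view(-1))
```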
N
rF  r}   r  r  r~   r  r  r   r  r  r   r3   r'   z=Cannot handle batch sizes > 1 if no padding token is defined.r`   )re   rd   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  
regressionsingle_label_classificationmulti_label_classification)rg  ri  rF  r   rk  )rO   r  rE  r4  rH   pad_token_idrI   rt   re   rK   int32r  argmaxr7   r   r   r   problem_typer.  rd   r<  rG   r   rA   r
   r   r	   r   rF  r   rk  )r   r  rF  r}   r  r  r~   r  r	  r  r   r  r  r  r   ri  r  r  last_non_pad_tokennon_pad_masktoken_indicespooled_logitsrg  r!  r@  s                            r]   r   %GPT2ForSequenceClassification.forward  s   D &1%<k$++B]B]"..+))%'/!5# / 
 ,A.M* *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r   )ry  rx  r.  r4  rE  NNNNNNNNNNNN)r   r   r   r   r   r"   r   rK   r   r   r   r   r   r   r   r   r   r   r   s   @r]   r2  r2    sL     15@D6:59371559-1$(,0/3&*j
E,,-j
 "%ell(;"<=j
 !!2!23	j

 !!1!12j
 u//0j
 E--.j
   1 12j
 ))*j
 D>j
 $D>j
 'tnj
 d^j
 
u66	7j
 j
r   r2  c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\	\	\R                           S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\	\4   4S jj5       rSrU =r$ )GPT2ForTokenClassificationi@  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        US5      (       a  UR                  b  UR                  nO-[        US5      (       a  UR                  b  UR                  nOSn[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        SU l        S U l        U R!                  5         g )Nclassifier_dropouthidden_dropoutg?F)r   r   r.  rn  rE  r   rH  rI  r   r   r   r/  r   
classifierrx  ry  r{  )r   rO   rH  r   s      r]   r   #GPT2ForTokenClassification.__init__B  s      ++$V,6/00V5N5N5Z!'!:!:V-..63H3H3T!'!6!6!$zz"45))F$6$68I8IJ $ 	r   r  rF  r}   r  r  r~   r  r	  r  r   r  r  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   nU R                  U5      nU R	                  U5      nSnUbW  UR                  UR                  5      n[        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ...,
    config.num_labels - 1]`.
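For this head the loss is a per-token cross-entropy; a minimal sketch with illustrative names:

```python
# Per-token classification loss: logits (batch, seq_len, num_labels); labels (batch, seq_len).
import torch
from torch import nn


def token_classification_loss(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    loss_fct = nn.CrossEntropyLoss()  # positions labelled -100 are ignored by default
    return loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1).to(logits.device))
```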
Nr7  r   r`   r3   )rg  ri  r   rk  )rO   r  rE  r   rJ  rt   re   r
   r   r.  r   r   rk  )r   r  rF  r}   r  r  r~   r  r	  r  r   r  r  r  r   ri  rg  r!  r@  s                      r]   r   "GPT2ForTokenClassification.forwardW  s   D &1%<k$++B]B]"..+))%'/!5# / 
 ,A.]3/YYv}}-F')HFKKDOO<fkk"oNDY!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r   )rJ  ry  r   rx  r.  rE  rD  )r   r   r   r   r   r"   r   rK   r   r   r   r   r   r   r   r   r   r   r   s   @r]   rF  rF  @  sL   *  15@D6:59371559-1$(,0/3&*D
E,,-D
 "%ell(;"<=D
 !!2!23	D

 !!1!12D
 u//0D
 E--.D
   1 12D
 ))*D
 D>D
 $D>D
 'tnD
 d^D
 
u++	,D
 D
r   rF  c                   r  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )GPT2ForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  S5      U l        SU l	        S U l
        U R                  5         g )Nr3   F)r   r   r.  rn  rE  r   r/  r   
qa_outputsrx  ry  r{  r  s     r]   r   !GPT2ForQuestionAnswering.__init__  s_      ++$V,))F$6$6: $ 	r   r  r}   r  r  r~   r  start_positionsend_positionsr   r  r  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a*  UR                  S5      R                  UR                  5      n[        UR                  5       5      S:  a*  UR                  S5      R                  UR                  5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S	9$ )
r  N)r}   r  r  r~   r  r   r  r  r   r'   r`   rf   )ignore_indexr3   )rg  start_logits
end_logitsr   rk  )rO   r  rE  rQ  rC   rA   r   rF   rl   rt   re   clampr
   r   r   rk  )r   r  r}   r  r  r~   r  rS  rT  r   r  r  r  sequence_outputri  rW  rX  
total_lossignored_indexr!  
start_lossend_lossr@  s                          r]   r    GPT2ForQuestionAnswering.forward  s   : &1%<k$++B]B]""))%'/!5# # 

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r   )ry  rx  r.  rQ  rE  )NNNNNNNNNNN)r   r   r   r   r   r"   r   rK   r   r   r   r   r   r   r   r   r   r   s   @r]   rO  rO    s2     156:593715596:48,0/3&*L
E,,-L
 !!2!23L
 !!1!12	L

 u//0L
 E--.L
   1 12L
 "%"2"23L
   0 01L
 $D>L
 'tnL
 d^L
 
u22	3L
 L
r   rO  )r  rO  r2  rF  r  rn  rD  r^   r   )MrB  rS  r9   r  dataclassesr   typingr   r   r   r   rK   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   r   cache_utilsr   r   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr    r!   r"   r#   utils.deprecationr$   utils.model_parallel_utilsr%   r&   configuration_gpt2r(   
get_loggerr   r7   r^   r   Moduler   r   r  r"  rD  rf  r  r  rn  r  r  r2  rF  rO  __all__rX  r   r]   <module>rr     s    "  	  ! 3 3    A A 1 P P ) c  G Y Y  1 K * 
		H	%5p(%VD)BII D)Nbii "N		 Nd`")) `F *s/ *s *sZ &: &: &:R @ ( `# ` `F J
)? J
J
Z @
. @
@
F y
$7 y
y
x [
!4 [
 [
| [
2 [
 [
|	r   