
    fTh)                        S r SSKrSSKrSSKrSSKJr  SSKJrJrJ	r	J
r
JrJr  SSKrSSKJr  SSKJrJrJr  SSKJrJrJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJ r J!r!J"r"  SSK#J$r$J%r%J&r&  SSK'J(r(  \&RR                  " \*5      r+S r,\RZ                  " 5       \\\S.r. " S S\R^                  5      r0 " S S\R^                  5      r1 " S S\R^                  5      r2 " S S\R^                  5      r3\% " S S\5      5       r4\ " S S\$5      5       r5\% " S S\45      5       r6\%" S S!9 " S" S#\4\5      5       r7\%" S$S!9 " S% S&\45      5       r8\%" S'S!9 " S( S)\45      5       r9/ S*Qr:g)+zPyTorch OpenAI GPT model.    N)	dataclass)AnyCallableDictOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )gelu_newget_activationsilu)GenerationMixin)BaseModelOutputCausalLMOutputSequenceClassifierOutput)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_conv1d_layer)ModelOutputauto_docstringlogging   )OpenAIGPTConfigc           	      	   SSK nSSKnSU;   a  [        R                  R	                  U5      n[
        R                  SU 35        [        US-   SSS9 n[        R                  " U5      nSSS5        [        US	-   SSS9 n[        R                  " U5      nSSS5        UR                  W V	s/ s H  oR                  U	5      PM     sn	5      n
[        S
5       Vs/ s H  oR                  USU S3-   5      PM     nnUR                  UR                  US5      U
5      SS n[        X5       VV	s/ s H  u  pUR!                  U	5      PM     nnn	U Vs/ s H  oR#                  5       PM     nnU R$                  R&                  R(                  US   R(                  :w  a<  [+        SU R$                  R&                  R(                   SUS   R(                   35      eU R,                  R&                  R(                  US   R(                  :w  a<  [+        SU R,                  R&                  R(                   SUS   R(                   35      e[.        R0                  " US   5      U R$                  R&                  l        [.        R0                  " US   5      U R,                  R&                  l        WR5                  S5        UR5                  S5        UR5                  S5        [        Xl5       GHZ  u  nnUSS nUSS S:w  a  [+        SU S35      eUSS nUR                  S5      nU nU H  nUR7                  SU5      (       a  UR                  SU5      nOU/nUS   S:X  a  [9        US5      nO;US   S:X  a  [9        US5      nO%US   S:X  a  [9        US5      nO[9        UUS   5      n[;        U5      S :  d  M  [=        US   5      nUU   nM     UR(                  UR(                  :w  a&  [+        S!UR(                   S"UR(                   S#35      e[
        R                  S$U 35        [.        R0                  " U5      Ul        GM]     U $ ! , (       d  f       GN= f! , (       d  f       GN= fs  sn	f s  snf s  sn	nf s  snf )%zGLoad tf pre-trained weights in a pytorch model (from NumPy arrays here)r   Nz.ckptzLoading weights from z/parameters_names.jsonrzutf-8)encodingz/params_shapes.json
   z/params_z.npyr   ztokens_embed.weight.shape: z% does not match init_param[1].shape: zpositions_embed.weight.shape: z% does not match init_param[0].shape:    z:0zLayer z does not end with :0/z[A-Za-z]+\d+z(\d+)gweightbbiasw   zPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpyospathdirnameloggerinfoopenjsonloadcumsumprodrangesplitconcatenatezipreshapesqueezetokens_embedr(   shape
ValueErrorpositions_embedtorch
from_numpydatapop	fullmatchgetattrlenint)modelconfigopenai_checkpoint_folder_pathr-   npnames_handlenamesshapes_handleshapesr@   offsetsninit_paramsparamarrnamearraypointerm_namescope_namesnums                        b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/openai/modeling_openai.pyload_tf_weights_in_openai_gptr_   ,   s   //(*8U(V%
KK'(E'FGH	+.FFV]	^bn		,' 
_	+.CCSSZ	[_l=) 
\iiV<VEV<=GX]^`XabXaST778XaS;MMNXaKb((2>>+q97CCRHK<?<TU<TLE5=='<TKU
 -88KS;;=KK8   &&+a.*>*>>)%*<*<*C*C*I*I)J KA$$%'
 	

 ##))[^-A-AA,U-B-B-I-I-O-O,P QA$$%'
 	

 &+%5%5k!n%EE"(-(8(8Q(HE  %	IIaLOOAOOA5.eABx9vdV+@ABBCRyzz#F||OV44 hhx8%h1~$!'84Q3&!'62Q3&!'84!';q>:;1$+a.)!#, $ ==EKK'~gmm_<Mekk]Zefgg078''.; /< LA 
_	^	[	[<bU
 9s0   QQ!/Q3!Q83Q=R
Q!
Q0)relur   geluswishc                   V   ^  \ rS rSrS	U 4S jjrS rS
S jrS rS	S jrS
S jr	Sr
U =r$ )	Attention}   c           	      H  > [         TU ]  5         UnXSR                  -  S:w  a  [        SU SUR                   35      eU R	                  S[
        R                  " [
        R                  " X"5      5      R                  SSX"5      SS9  UR                  U l        XPl	        X@l
        [        US-  U5      U l        [        XQ5      U l        [        R                  " UR                   5      U l        [        R                  " UR$                  5      U l        [)        5       U l        g )	Nr   zAttention n_state shape: z$ must be divisible by config.n_head r*   r   F
persistentr   )super__init__n_headrA   register_bufferrC   trilonesview
split_sizescaler   c_attnc_projr
   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropoutsetpruned_heads)selfnxn_positionsrL   rq   n_state	__class__s         r^   rj   Attention.__init__~   s    ]]"a'8	AeflfsfsetuvvJJuzz+;<AA!Qa 	 	

 mm!
Wq["-W)JJv'8'89ZZ(:(:;E    c                 R   [        U5      S:X  a  g [        XR                  U R                  U R                  -  U R                  5      u  p[
        R                  " X"U R                  -   USU R                  -  -   /5      n[        U R                  USS9U l        [        U R                  USS9U l	        U R                  U R                  -  U R                  [        U5      -
  -  U l        U R                  [        U5      -
  U l        U R                  R                  U5      U l        g )Nr   r,   r   dim)rI   r   rk   rp   rz   rC   catr   rr   rs   union)r{   headsindex
index_attns       r^   prune_headsAttention.prune_heads   s    u:?7;;4;; >@Q@Q
 YYt'>T__I\@]^_
(jaH(eC??dkk9dkkCPUJ>VWkkCJ. --33E:r   c                    [         R                  " X5      nU R                  (       a(  U[        R                  " UR                  S5      5      -  nU R                  S S 2S S 2S UR                  S5      2S UR                  S5      24   nXx-  SSU-
  -  -   nUb  Xt-   n[        R                  R                  USS9nU R                  U5      nUb  Xu-  n[         R                  " Xs5      /n	U(       a  U	R                  U5        U	$ )Nr#   r%   g     r   r   )rC   matmulrq   mathsqrtsizer*   r
   
functionalsoftmaxrv   append)
r{   qkvattention_mask	head_maskoutput_attentionsr+   r)   outputss
             r^   _attnAttention._attn   s    LL::DIIaffRj))A IIaLaffRjL,AFF2J,67EDAEN"%"AMM!!!!,a   A<<%&NN1r   c                     UR                  SSSS5      R                  5       nUR                  5       S S UR                  S5      UR                  S5      -  4-   nUR                  " U6 $ )Nr   r,   r   r   r%   r#   )permute
contiguousr   ro   )r{   xnew_x_shapes      r^   merge_headsAttention.merge_heads   s\    IIaAq!,,.ffhsmqvvbzAFF2J'>&@@vv{##r   c                     UR                  5       S S U R                  UR                  S5      U R                  -  4-   nUR                  " U6 nU(       a  UR                  SSSS5      $ UR                  SSSS5      $ )Nr#   r   r,   r   r   )r   rk   ro   r   )r{   r   r   r   s       r^   split_headsAttention.split_heads   sm    ffhsmt{{AFF2J$++4M&NNFFK 99Q1a((99Q1a((r   c                 n   U R                  U5      nUR                  U R                  SS9u  pVnU R                  U5      nU R                  USS9nU R                  U5      nU R	                  XVXrX45      nUS   n	U R                  U	5      n	U R                  U	5      n	U R                  U	5      n	U	/USS  -   n
U
$ )Nr,   r   T)r   r   r   )rr   r:   rp   r   r   r   rs   rx   )r{   r   r   r   r   querykeyvalueattn_outputsar   s              r^   forwardAttention.forward   s    KKNGGDOOG;E  'sd+  'zz%eYbOQKKNq!#QR((r   )rv   rr   rs   rk   rz   rx   rq   rp   FNNF)__name__
__module____qualname____firstlineno__rj   r   r   r   r   r   __static_attributes____classcell__r   s   @r^   rd   rd   }   s&    "*;2$
) r   rd   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )MLP   c                    > [         TU ]  5         UR                  n[        X5      U l        [        X15      U l        [        UR                     U l        [        R                  " UR                  5      U l        g N)ri   rj   n_embdr   c_fcrs   ACT_FNSafnactr
   rt   rw   dropout)r{   r~   rL   r|   r   s       r^   rj   MLP.__init__   sU    ]]7'	R)6::&zz&"4"45r   c                     U R                  U R                  U5      5      nU R                  U5      nU R                  U5      $ r   r   r   rs   r   )r{   r   hh2s       r^   r   MLP.forward   s4    HHTYYq\"[[^||Br   r   r   r   r   r   rj   r   r   r   r   s   @r^   r   r      s    6   r   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )Block   c                   > [         TU ]  5         UR                  n[        XAX#5      U l        [
        R                  " XBR                  S9U l        [        SU-  U5      U l
        [
        R                  " XBR                  S9U l        g )N)eps   )ri   rj   r   rd   attnr
   	LayerNormlayer_norm_epsilonln_1r   mlpln_2)r{   r}   rL   rq   r|   r   s        r^   rj   Block.__init__   sc    ]]bv=	LL)B)BC	q2vv&LL)B)BC	r   c                     U R                  UUUUS9nUS   nU R                  X-   5      nU R                  U5      nU R                  Xx-   5      n	U	/USS  -   n
U
$ )N)r   r   r   r   r   )r   r   r   r   )r{   r   r   r   r   r   r   rT   mr   r   s              r^   r   Block.forward   sq    yy)/	 ! 
 OIIaeHHQKIIae#QR((r   )r   r   r   r   r   r   r   r   s   @r^   r   r      s    D r   r   c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\R                  4S jjrS	rU =r$ )OpenAIGPTSequenceSummaryi  a  
Compute a single vector summary of a sequence hidden states.

Args:
    config ([`OpenAIGPTConfig`]):
        The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
        config class of your model for the default values it uses):

        - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

            - `"last"` -- Take the last token hidden state (like XLNet)
            - `"first"` -- Take the first token hidden state (like Bert)
            - `"mean"` -- Take the mean of all tokens hidden states
            - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
            - `"attn"` -- Not implemented now, use multi-head attention

        - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
        - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
          (otherwise to `config.hidden_size`).
        - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
          another string or `None` will add no activation.
        - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
        - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
rL   c                   > [         TU ]  5         [        USS5      U l        U R                  S:X  a  [        e[
        R                  " 5       U l        [        US5      (       a  UR                  (       aq  [        US5      (       a.  UR                  (       a  UR                  S:  a  UR                  nOUR                  n[
        R                  " UR                  U5      U l        [        USS 5      nU(       a  [        U5      O[
        R                  " 5       U l        [
        R                  " 5       U l        [        US5      (       a5  UR"                  S:  a%  [
        R$                  " UR"                  5      U l        [
        R                  " 5       U l        [        US	5      (       a7  UR(                  S:  a&  [
        R$                  " UR(                  5      U l        g g g )
Nsummary_typelastr   summary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)ri   rj   rH   r   NotImplementedErrorr
   Identitysummaryhasattrr   r   
num_labelshidden_sizeLinearr   
activationfirst_dropoutr   rt   last_dropoutr   )r{   rL   num_classesactivation_stringr   s       r^   rj   !OpenAIGPTSequenceSummary.__init__  sa   #FNFC& &%{{}6-..63J3Jv788V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]62338T8TWX8X!#F,H,H!IDKKM6122v7R7RUV7V "

6+F+F GD 8W2r   hidden_states	cls_indexreturnc                    U R                   S:X  a  USS2S4   nGOU R                   S:X  a  USS2S4   nGOU R                   S:X  a  UR                  SS9nOU R                   S	:X  a  Uc?  [        R                  " US
SS2SS24   UR                  S   S-
  [        R
                  S9nOXUR                  S5      R                  S5      nUR                  SUR                  5       S-
  -  UR                  S5      4-   5      nUR                  SU5      R                  S5      nOU R                   S:X  a  [        eU R                  W5      nU R                  U5      nU R                  U5      nU R!                  U5      nU$ )a#  
Compute a single vector summary of a sequence hidden states.

Args:
    hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
        The hidden states of the last layer.
    cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
        Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

Returns:
    `torch.FloatTensor`: The summary of the sequence hidden states.
r   Nr#   firstr   meanr   r   r   .r%   dtype)r#   r   )r   r   rC   	full_liker@   long	unsqueezeexpandr   r   gatherr>   r   r   r   r   r   )r{   r   r   outputs       r^   r    OpenAIGPTSequenceSummary.forward;  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*r   )r   r   r   r   r   r   )r   r   r   r   __doc__r   rj   rC   FloatTensorr   
LongTensorr   r   r   r   s   @r^   r   r     sV    2H H< Y])"..);CEDTDT;U)			) )r   r   c                   &    \ rS rSr\r\rSrS r	Sr
g)OpenAIGPTPreTrainedModelig  transformerc                 $   [        U[        R                  [        45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weights.g        )r   stdN      ?)
isinstancer
   r   r   r(   rE   normal_rL   initializer_ranger*   zero_	Embeddingpadding_idxr   fill_)r{   modules     r^   _init_weights&OpenAIGPTPreTrainedModel._init_weightsm  s   fryy&122 MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r    N)r   r   r   r   r   config_classr_   load_tf_weightsbase_model_prefixr  r   r  r   r^   r  r  g  s    "L3O%*r   r  c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   S
rg)OpenAIGPTDoubleHeadsModelOutputi~  aJ  
Base class for outputs of models predicting if two sentences are consecutive or not.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
        Multiple choice classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
        Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nlossmc_losslogits	mc_logitsr   
attentionsr  )r   r   r   r   r   r  r   rC   r  __annotations__r  r  r  r   r   r  r   r  r   r^   r  r  ~  s    2 )-D(5$$
%,+/GXe''(/*.FHU&&'.-1Ix))*18<M8E%"3"345<59Ju00129r   r  c                   ^  ^  \ rS rSrU 4S jrS rS rS r\         SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                      \4   4S jj5       rSrU =r$ )OpenAIGPTModeli  c                 Z  > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        UR                  USS9PM     sn5      U l        U R#                  S[$        R&                  " UR                  5      SS9  U R)                  5         g s  snf )NT)rq   position_idsFrg   )ri   rj   r
   r  
vocab_sizer   r?   r}   rB   rt   
embd_pdropdrop
ModuleListr9   n_layerr   r   rl   rC   arange	post_init)r{   rL   _r   s      r^   rj   OpenAIGPTModel.__init__  s     LL):):FMMJ!||F,>,>NJJv001	W\]c]k]kWlmWlRSf&8&8& MWlmn^U\\&:L:L-MZ_`	  ns   =D(c                     U R                   $ r   r?   r{   s    r^   get_input_embeddings#OpenAIGPTModel.get_input_embeddings  s       r   c                     Xl         g r   r-  r{   new_embeddingss     r^   set_input_embeddings#OpenAIGPTModel.set_input_embeddings  s    *r   c                     UR                  5        H-  u  p#U R                  U   R                  R                  U5        M/     g)zf
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
N)itemsr   r   r   )r{   heads_to_prunelayerr   s       r^   _prune_headsOpenAIGPTModel._prune_heads  s5     +002LEFF5M**51 3r   	input_idsr   token_type_idsr"  r   inputs_embedsr   output_hidden_statesreturn_dictr   c
                 `   Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb7  U R                  X5        UR                  5       n
UR                  SU
S   5      nO"Ub  UR                  5       S S n
O[	        S5      eUc  U R                  S S U
S   24   nUb  UR                  S5      R                  S5      nUR                  [        U R                  5       5      R                  S9nSU-
  [        R                  " U R                  5      R                   -  nU R#                  XPR                   R$                  5      nUc  U R'                  U5      nU R)                  U5      nUb3  UR                  SUR                  S5      5      nU R'                  U5      nOSnXk-   U-   nU R+                  U5      nXR                  S5      4-   nU(       a  S	OS nU(       a  S	OS n[-        U R.                  5       H5  u  nnU(       a  UU4-   nU" XUU   US
9nUS   nU(       d  M,  UUS   4-   nM7     UR                  " U6 nU(       a  UU4-   nU	(       d  [1        S UUU4 5       5      $ [3        UUUS9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer#   z5You have to specify either input_ids or inputs_embedsr   r,   r   r  r   r  )r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r  ).0r   s     r^   	<genexpr>)OpenAIGPTModel.forward.<locals>.<genexpr>  s     h$Vq$Vs   	)last_hidden_stater   r  )rL   r   r?  use_return_dictrA   %warn_if_padding_and_no_attention_maskr   ro   r"  r   tonext
parametersr   rC   finfominget_head_maskr'  r?   rB   r%  	enumerater   tupler   )r{   r<  r   r=  r"  r   r>  r   r?  r@  input_shapeposition_embedstoken_type_embedsr   output_shapeall_attentionsall_hidden_statesiblockr   s                       r^   r   OpenAIGPTModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K!r;r?;I&',,.s3KTUU,,T3D[_3D-DEL % ,55a8BB1EN ,..T$//:K5L5R5R.SN!N2ekk$**6M6Q6QQN &&y++2E2EF	  --i8M..|<%+00^5H5H5LMN $ 1 1. A !%7:KK		-0"&8&8&<%>>0d"6BD!$&&)HAu#$58H$H!M9Q<[lmG#AJM  !/71:-!? * &**L9 1]4D Dh]4E~$Vhhh++%
 	
r   )r%  r   rB   r?   )	NNNNNNNNN)r   r   r   r   rj   r/  r4  r:  r   r   rC   r  r  boolr	   r   Tensorr   r   r   r   r   s   @r^   r   r     s   
!+2  156:59371559,0/3&*Y
E,,-Y
 !!2!23Y
 !!1!12	Y

 u//0Y
 E--.Y
   1 12Y
 $D>Y
 'tnY
 d^Y
 
uU\\"O3	4Y
 Y
r   r   z
    OpenAI GPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                     ^  \ rS rSrS/rU 4S jrS rS r\          SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                      \4   4S jj5       rS\
R                  S\\\4   4S jrSrU =r$ )OpenAIGPTLMHeadModeli  lm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFr*   )
ri   rj   r   r  r
   r   r   r#  lm_headr)  r{   rL   r   s     r^   rj   OpenAIGPTLMHeadModel.__init__"  sG     )&1yy0A0AN 	r   c                     U R                   $ r   rc  r.  s    r^   get_output_embeddings*OpenAIGPTLMHeadModel.get_output_embeddings*      ||r   c                     Xl         g r   rg  r2  s     r^   set_output_embeddings*OpenAIGPTLMHeadModel.set_output_embeddings-      %r   r<  r   r=  r"  r   r>  labelsr   r?  r@  r   c                 f   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUb*  U R                  " UU4SU R                   R
                  0UD6nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
Nr   r=  r"  r   r>  r   r?  r@  r   r#  r   r  r  r   r  )	rL   rG  r  rc  loss_functionr#  r   r   r  )r{   r<  r   r=  r"  r   r>  ro  r   r?  r@  kwargstransformer_outputsr   	lm_logitsr  r   s                    r^   r   OpenAIGPTLMHeadModel.forward0  s    * &1%<k$++B]B]"..))%'/!5# / 

 ,A.LL/	%%  ;;11 	D \$7$;;F)-)9TGf$EvE-;;*55	
 	
r   c                 
    SU0$ )Nr<  r  )r{   r<  rt  s      r^   prepare_inputs_for_generation2OpenAIGPTLMHeadModel.prepare_inputs_for_generationj  s    Y''r   )rc  r  
NNNNNNNNNN)r   r   r   r   _tied_weights_keysrj   rh  rl  r   r   rC   r  r  rZ  r	   r   r[  r   r   r   strr   ry  r   r   r   s   @r^   r^  r^    sU    ++&  156:59371559-1,0/3&*7
E,,-7
 !!2!237
 !!1!12	7

 u//07
 E--.7
   1 127
 ))*7
 $D>7
 'tn7
 d^7
 
uU\\"N2	37
 7
r(u7G7G (VZ[^`c[cVd ( (r   r^  a  
        OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
    RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
    input embeddings, the classification head takes as input the input of a specified classification token index in the
    input sequence).
    c                     ^  \ rS rSrS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                      \4   4S jj5       rSrU =r$ )OpenAIGPTDoubleHeadsModelio  r_  c                    > [         TU ]  U5        SUl        [        U5      U l        [
        R                  " UR                  UR                  SS9U l	        [        U5      U l        U R                  5         g )Nr   Frb  )ri   rj   r   r   r  r
   r   r   r#  rc  r   multiple_choice_headr)  rd  s     r^   rj   "OpenAIGPTDoubleHeadsModel.__init__z  s\     )&1yy0A0AN$<V$D! 	r   c                     U R                   $ r   rg  r.  s    r^   rh  /OpenAIGPTDoubleHeadsModel.get_output_embeddings  rj  r   c                     Xl         g r   rg  r2  s     r^   rl  /OpenAIGPTDoubleHeadsModel.set_output_embeddings  rn  r   r<  r   r=  r"  r   r>  mc_token_idsro  	mc_labelsr   r?  r@  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUU
UUS9	nUS   nU R                  U5      nU R	                  X5      R                  S5      nSu  nnU	bA  [        5       nU" UR                  SUR                  S5      5      U	R                  S5      5      nUbr  USSS2SS24   R                  5       nUSSS24   R                  5       n[        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  UU4USS -   nUb  U4U-   nUb  U4U-   $ U$ [        UUUUUR                  UR                  S9$ )	a  
mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
    Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
    1]`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-1, 0, ..., config.vocab_size]` All labels set to `-100` are
    ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
    where *num_choices* is the size of the second dimension of the input tensors. (see *input_ids* above)

Examples:

```python
>>> from transformers import AutoTokenizer, OpenAIGPTDoubleHeadsModel
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
>>> model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-community/openai-gpt")
>>> tokenizer.add_special_tokens(
...     {"cls_token": "[CLS]"}
... )  # Add a [CLS] to the vocabulary (we should train it also!)
>>> model.resize_token_embeddings(len(tokenizer))

>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
>>> mc_token_ids = torch.tensor([input_ids.size(-1) - 1, input_ids.size(-1) - 1]).unsqueeze(0)  # Batch size 1

>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_logits = outputs.logits
>>> mc_logits = outputs.mc_logits
```Nrq  r   r#   )NN.r   )r  r  r  r  r   r  )rL   rG  r  rc  r  r>   r   ro   r   r   r  r   r  )r{   r<  r   r=  r"  r   r>  r  ro  r  r   r?  r@  ru  r   rv  r  lm_lossr  loss_fctshift_logitsshift_labelsr   s                          r^   r   !OpenAIGPTDoubleHeadsModel.forward  s   b &1%<k$++B]B]"..))%'/!5# / 

 ,A.LL/	--mJRRSUV	% ')Hy~~b)..2DEy~~VXGYZG$S#2#q[1<<>L!#qr'?557L')H|00\5F5Fr5JK\M^M^_aMbcG+.A!".EEF"!f,,3,?WJ'KVK.-;;*55
 	
r   )rc  r  r  )NNNNNNNNNNNN)r   r   r   r   r|  rj   rh  rl  r   r   rC   r  r  rZ  r	   r   r[  r  r   r   r   r   s   @r^   r  r  o  si    ++	&  156:5937155937-104,0/3&*Y
E,,-Y
 !!2!23Y
 !!1!12	Y

 u//0Y
 E--.Y
   1 12Y
 u//0Y
 ))*Y
 E,,-Y
 $D>Y
 'tnY
 d^Y
 
uU\\"$CC	DY
 Y
r   r  a  
    The Original OpenAI GPT Model transformer with a sequence classification head on top (linear layer).
    [`OpenAIGPTForSequenceClassification`] uses the last token in order to do the classification, as other causal
    models (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the
    last token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding
    token in each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since
    it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take
    the last value in each row of the batch).
    c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\R                     \4   4S jj5       rSrU =r$ )"OpenAIGPTForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g ra  )
ri   rj   r   r   r  r
   r   r   scorer)  rd  s     r^   rj   +OpenAIGPTForSequenceClassification.__init__  sR      ++)&1YYv}}dooEJ
 	r   r<  r   r=  r"  r   r>  ro  r   r?  r@  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nUb  UR                  SS u  pOUR                  SS u  pU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S	35        U[        R                  " XR                  S
9U4   nSnUGb  U R                   R"                  c  U R$                  S:X  a  SU R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                   R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                   R"                  S:X  a  [5        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nrq  r   r,   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r#   )devicer   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r  
regressionsingle_label_classificationmulti_label_classificationrr  )rL   rG  r  r  r@   pad_token_idrA   rI  r  rC   int32r(  argmaxr2   warning_oncer   r   problem_typer   r   r   rJ   r   r>   r   ro   r   r   r   r  )r{   r<  r   r=  r"  r   r>  ro  r   r?  r@  ru  r   r  
batch_sizesequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  r  r   s                          r^   r   *OpenAIGPTForSequenceClassification.forward  s   ( &1%<k$++B]B]"..))%'/!5# / 

 ,A.M* *3//"1*='J*7*=*=bq*A'J ;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE' -;;*55	
 	
r   )r   r  r  r{  )r   r   r   r   rj   r   r   rC   r  r  rZ  r	   r   r[  r   r   r   r   r   s   @r^   r  r    s"     156:59371559-1,0/3&*[
E,,-[
 !!2!23[
 !!1!12	[

 u//0[
 E--.[
   1 12[
 ))*[
 $D>[
 'tn[
 d^[
 
uU\\"$<<	=[
 [
r   r  )r  r  r^  r   r  r_   );r   r5   r   r/   dataclassesr   typingr   r   r   r   r   r	   rC   r
   torch.nnr   r   r   activationsr   r   r   
generationr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_openair   
get_loggerr   r2   r_   ReLUr   Modulerd   r   r   r   r  r  r   r^  r  r  __all__r  r   r^   <module>r     s        	 ! > >   A A 9 9 ) Y Y - Y Y 
 2 
		H	%K\ 779dHt
LZ		 Zz "))  BII 6`ryy `F * * *, :k : :D t
- t
 t
n M(3_ M(M(` n
 8 n
n
b 
f
)A f

f
Rr   