
    fThP~                        S r SSKJrJrJr  SSKrSSKrSSKJr  SSK	J
r
JrJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJr  \R<                  " \5      r S r!S r"S"S jr# " S S\RH                  5      r%S r& " S S\RH                  5      r'\ " S S\5      5       r(\ " S S\(5      5       r)\" SS9 " S S\(\5      5       r*\" SS9 " S S \(5      5       r+/ S!Qr,g)#zPyTorch CTRL model.    )OptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPastSequenceClassifierOutput)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
CTRLConfigc                 P    S[         R                  " SSUS-  -  U-  5      -  nX-  $ )Nr   i'     )torchpow)posid_model_sizeangle_ratess       ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/ctrl/modeling_ctrl.py
angle_defnr    '   s-    eiiQ!V'DEEK    c                    [        [        R                  " U [        R                  S9R	                  U5      R                  S5      [        R                  " U[        R                  S9R	                  U5      R                  S5      U5      n[        R                  " US S 2SS S24   5      n[        R                  " US S 2SS S24   5      n[        R                  " XE/SS9nU$ )Ndtyper   r   r   dim)	r    r   arangeint64to	unsqueezesincoscat)positionr   r$   
angle_radssinescosinespos_encodings          r   positional_encodingr4   ,   s    XU[[144U;EEaH\588?II!LJ IIjADqD)*Eii
1add7+,G99e-26Lr!   c           	         [         R                  " XR                  SSSS5      5      nUR                  S   nU[        R
                  " U5      -  nUb3  UR                  S5      UR                  S5      pXX-
  U
2S U
24   S-  -  nUb  X-   n[         R                  " USS9nUb  X-  n[         R                  " X5      nX4$ )	Nr   r   r
   r   r%   g     r&   )r   matmulpermuteshapenpsqrtsizesoftmax)qkvmaskattention_mask	head_mask	matmul_qkdkscaled_attention_logitsndnsattention_weightsoutputs                r   scaled_dot_product_attentionrK   ;   s    Q		!Q1 56I	
B'"''"+5(--b13J3O3OPR3SB"crc(9#:T#AA!"9"J&=2F -9\\+/F$$r!   c                   H   ^  \ rS rSrU 4S jrS rS r     SS jrSrU =r	$ )MultiHeadAttentionU   c                 h  > [         TU ]  5         X l        Xl        [	        XR                  -  5      U l        [        R                  " X5      U l        [        R                  " X5      U l	        [        R                  " X5      U l
        [        R                  " X5      U l        [        5       U l        g N)super__init__	num_headsr   intdepthr   LinearWqWkWvdensesetpruned_heads)selfr   rS   	__class__s      r   rR   MultiHeadAttention.__init__V   sv    "(67
))L7))L7))L7YY|:
Er!   c                    U R                   U R                  -  n[        U5      S:X  a  g [        XR                  X R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        X R                  -  U l         U R                  R                  U5      U l        g )Nr   r   r&   )r   rS   lenr   r\   r   rW   rX   rY   rZ   union)r]   headsattention_head_sizeindexs       r   prune_headsMultiHeadAttention.prune_headsd   s    "//4>>Au:?7~~Obduduv %TWWe4$TWWe4$TWWe4'

EqA
 #e*4/..@ --33E:r!   c                 x    UR                  USU R                  U R                  5      nUR                  / SQ5      $ )Nr%   r   r   r   r
   )reshaperS   rU   r8   )r]   x
batch_sizes      r   split_into_heads#MultiHeadAttention.split_into_headsu   s-    IIj"dnndjjAyy&&r!   c
                 j   UR                   S   n
U R                  U5      nU R                  U5      nU R                  U5      nU R	                  X:5      nU R	                  X*5      nU R	                  X5      nUb5  US   US   p[
        R                  " X4SS9n[
        R                  " X4SS9nUSL a  [
        R                  " X!45      nOSn[        X2XXg5      nUS   R                  / SQ5      nUS   nUR                  U
SU R                  5      nU R                  U5      nX4nU	(       a  UU4-   nU$ )	Nr   r   r6   r&   TrP   ri   r%   )r9   rW   rX   rY   rm   r   r.   stackrK   r8   rj   r   rZ   )r]   r@   r?   r>   rA   
layer_pastrB   rC   	use_cacheoutput_attentionsrl   past_key
past_valuepresentrJ   scaled_attentionattnoriginal_size_attentionoutputss                      r   forwardMultiHeadAttention.forwardy   s0    WWQZ
GGAJGGAJGGAJ!!!0!!!0!!!0!#-a=*Q-j		8-R0A		:/r2Akk1&)GG-aA^W!!9,,\:ay"2":"::r4K\K\"]34#'Gr!   )rX   rW   rY   r   rZ   rU   rS   r\   NNNFF)
__name__
__module____qualname____firstlineno__rR   rf   rm   r{   __static_attributes____classcell__r^   s   @r   rM   rM   U   s-    ";"' ( (r!   rM   c                     [         R                  " [         R                  " X5      [         R                  " 5       [         R                  " X5      5      $ rP   )r   
SequentialrV   ReLU)r   dffs     r   point_wise_feed_forward_networkr      s-    ==<5rwwy"))CB^__r!   c                   8   ^  \ rS rSrSU 4S jjr SS jrSrU =r$ )EncoderLayer   c                 6  > [         TU ]  5         [        X5      U l        [	        X5      U l        [        R                  " USS9U l        [        R                  " USS9U l	        [        R                  " U5      U l        [        R                  " U5      U l        g )Ngư>eps)rQ   rR   rM   multi_head_attentionr   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)r]   r   rS   r   rater^   s        r   rR   EncoderLayer.__init__   sk    $6|$O!2<E,,|>,,|>

4(

4(r!   c                 
   U R                  U5      nU R                  UUUUUUUUUS9	n	U	S   n
U R                  U
5      n
X-   nU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU4U	SS  -   nU$ )Nrq   rB   rC   rr   rs   r   r   )r   r   r   r   r   r   )r]   rk   rA   rq   rB   rC   rr   rs   normedattn_outputsattn_outputout1out2
ffn_outputrz   s                  r   r{   EncoderLayer.forward   s     #00!)/ 1 

 #1ommK0t$XXd^
]]:.
 'L,,r!   )r   r   r   r   r   r   )g?r}   )r~   r   r   r   rR   r{   r   r   r   s   @r   r   r      s    
) qv r!   r   c                   "    \ rS rSr\rSrS rSrg)CTRLPreTrainedModel   transformerc                 $   [        U[        R                  [        45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weights.g        )meanstdN      ?)
isinstancer   rV   r   weightdatanormal_configinitializer_rangebiaszero_	Embeddingpadding_idxr   fill_)r]   modules     r   _init_weights!CTRLPreTrainedModel._init_weights   s   fryy&122 MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r!    N)	r~   r   r   r   r   config_classbase_model_prefixr   r   r   r!   r   r   r      s    L%*r!   r   c                     ^  \ rS rSrU 4S jrS rS rS r\           SS\	\
R                     S\	\\\
R                           S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\\\
R                      \4   4S jj5       rSrU =r$ )	CTRLModel   c                   > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  U R                  [        R                  5      U l
        [        R                  " UR                  UR                  5      U l        [        R                  " UR                   5      U l        [        R$                  " ['        UR                  5       Vs/ s H9  n[)        UR                  UR*                  UR,                  UR.                  5      PM;     sn5      U l        [        R2                  " UR                  UR4                  S9U l        U R9                  5         g s  snf )Nr   )rQ   rR   n_embdr   n_layer
num_layersr4   n_positionsr   floatr3   r   r   
vocab_sizewr   
embd_pdropdropout
ModuleListranger   n_headr   resid_pdrophr   layer_norm_epsilon	layernorm	post_init)r]   r   _r^   s      r   rR   CTRLModel.__init__   s     "MM ../0B0BDDUDUW\WbWbcf//?zz&"3"34afgmguguavwav\]\&--

FDVDVWavw
 fmm9R9RS 	 xs   #A E.c                     U R                   $ rP   r   r]   s    r   get_input_embeddingsCTRLModel.get_input_embeddings   s    vvr!   c                     Xl         g rP   r   r]   new_embeddingss     r   set_input_embeddingsCTRLModel.set_input_embeddings   s    r!   c                     UR                  5        H-  u  p#U R                  U   R                  R                  U5        M/     g)zf
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
N)itemsr   r   rf   )r]   heads_to_prunelayerrc   s       r   _prune_headsCTRLModel._prune_heads  s5     +002LEFF5M..::5A 3r!   	input_idspast_key_valuesrB   token_type_idsposition_idsrC   inputs_embedsrr   rs   output_hidden_statesreturn_dictreturnc                 d   U	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  Ub  [        S5      eUbF  U R                  X5        UR                  5       nUR                  SUS   5      nUR                  S   nO1Ub#  UR                  5       SS nUR                  S   nO[        S5      eUb  UR                  OUR                  nUc%  Sn[        S/[        U R                  5      -  5      nOUS   S   R                  S5      nUc<  [        R                  " UUS   U-   [        R                   US9nUR#                  S5      nUb  US::  a  [        S5      eUR                  US5      nUR#                  S	5      R#                  S
5      nUR%                  U R&                  S9nSU-
  [        R(                  " U R&                  5      R*                  -  nU R-                  X`R                   R.                  5      nUbJ  UR                  SUS   5      nU R1                  U5      nU[2        R4                  " U R6                  5      -  nOSnUc  U R1                  U5      nUS   n[        R8                  " [        R:                  " UU-   UU-   5      S	5      R%                  U5      nU[2        R4                  " U R6                  5      -  nU R<                  R%                  U5      U l        U R<                  USS24   nUU-   U-   nU R?                  U5      nU(       a  SOSnU
(       a  SOSnU	(       a  SOSn[A        [C        U R                  U5      5       HJ  u  nu  nnU
(       a  UU4-   nU" UUUUUU   UU	S9nUSS
 u  nnUSL a  UU4-   nU	(       d  MA  UUS
   4-  nML     U RE                  U5      nU
(       a  UU4-   nU(       d  [        S UUUU4 5       5      $ [G        UUUUS9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
    [`PreTrainedTokenizer.encode`] for details.

    [What are input IDs?](../glossary#input-ids)

Example:

```python
>>> from transformers import AutoTokenizer, CTRLModel
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 5, 1280]
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer%   r   z5You have to specify either input_ids or inputs_embedsr6   )r$   devicez$batch_size has to be defined and > 0r   r   r#   r   r   r   Tc              3   .   #    U  H  oc  M  Uv   M     g 7frP   r   ).0r@   s     r   	<genexpr>$CTRLModel.forward.<locals>.<genexpr>  s     r$`q$`s   	)last_hidden_stater   hidden_states
attentions)$r   rs   rr   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr<   viewr9   r   tuplera   r   r   r(   longr+   r*   r$   finfominget_head_maskr   r   r:   r;   r   triuonesr3   r   	enumeratezipr   r   )r]   r   r   rB   r   r   rC   r   rr   rs   r   r   kwargsinput_shaperl   r   past_lengthtoken_type_embedsseq_lenrA   
pos_embedsr   presentsall_hidden_statesall_attentionsr   r   rq   rz   rv   s                                 r   r{   CTRLModel.forward	  s4   ` 2C1N-TXT_T_TqTq!*!6IDKK<Q<Q	$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T"K#TFS[$89O)!,Q/44R8K <<[_{5RZ_ZdZdmstL'11!4L %Q !GHH+00R@N ,55a8BB1EN ,..TZZ.@N!N2ekk$**6M6Q6QQN &&y++2E2EF	%+00[_EN $~ 6):):!;; !  FF9-Mb/zz%**W{%:Gk<QRTUVYYZ`a!2!233 !--008&&|Q7
%
25FF]3"2"6BD0d"+C,H"IA:#$58H$H!%-#A,#"3G &-Ra["M7D #wj0  71:-/# #J& }5 1]4D Dr]H>OQ_$`rrr&+$+%	
 	
r!   )r   r   r   r   r   r3   r   )NNNNNNNNNNN)r~   r   r   r   rR   r   r   r   r   r   r   
LongTensorr   FloatTensorboolr   Tensorr   r{   r   r   r   s   @r   r   r      sN   & B  15EI6:59371559$(,0/3&*^
E,,-^
 "%e.?.?(@"AB^
 !!2!23	^

 !!1!12^
 u//0^
 E--.^
   1 12^
 D>^
 $D>^
 'tn^
 d^^
 
uU\\"$;;	<^
 ^
r!   r   z
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                   4  ^  \ rS rSrS/rU 4S jrS rS r\            SS\	\
R                     S\	\\\
R                           S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\\\
R                      \4   4S jj5       rSS jr\S\\\
R                         S\
R                   S\\\
R                         4S j5       rSrU =r$ )CTRLLMHeadModeli  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NTr   )
rQ   rR   r   r   r   rV   r   r   lm_headr   r]   r   r^   s     r   rR   CTRLLMHeadModel.__init__  sG     $V,yy0A0AM 	r!   c                     U R                   $ rP   r  r   s    r   get_output_embeddings%CTRLLMHeadModel.get_output_embeddings  s    ||r!   c                     Xl         g rP   r  r   s     r   set_output_embeddings%CTRLLMHeadModel.set_output_embeddings  s    %r!   r   r   rB   r   r   rC   r   labelsrr   rs   r   r   r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   nU R                  U5      nSnUb*  U R                  " UU4SU R                   R
                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
    [`PreTrainedTokenizer.encode`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLLMHeadModel

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> sequence_ids = model.generate(inputs["input_ids"])
>>> sequences = tokenizer.batch_decode(sequence_ids)
>>> sequences
['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> round(outputs.loss.item(), 2)
9.21

>>> list(outputs.logits.shape)
[1, 5, 246534]
```N
r   rB   r   r   rC   r   rr   rs   r   r   r   r   r   )losslogitsr   r   r   )
r   r   r   r  loss_functionr   r   r   r   r   )r]   r   r   rB   r   r   rC   r   r  rr   rs   r   r   r  transformer_outputsr   	lm_logitsr!  rJ   s                      r   r{   CTRLLMHeadModel.forward  s	   v &1%<k$++B]B]"..+))%'/!5# / 
 ,A.LL/	%%  ;;11 	D \$7$;;F)-)9TGf$EvE%/??-;;*55
 	
r!   c                     UbH  US   S   R                   S   nUR                   S   U:  a  UnOUR                   S   S-
  nUS S 2US 24   nXUS.$ )Nr   r   r   )r   r   rr   )r9   )r]   r   r   rr   r  r  remove_prefix_lengths          r   prepare_inputs_for_generation-CTRLLMHeadModel.prepare_inputs_for_generation&  sr     &)!,Q/55a8K q!K/'2$ (1q'9A'=$!!%9%:":;I&Ybccr!   beam_idxc                 .   ^ [        U4S jU  5       5      $ )z
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
c              3   N   >#    U  H  n[        U4S  jU 5       5      v   M     g7f)c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectr*   r   )r   
past_stater+  s     r   r   ;CTRLLMHeadModel._reorder_cache.<locals>.<genexpr>.<genexpr>B  s1     j_iQ[))!X[[9J9J-KLL_is   7:Nr   )r   rq   r+  s     r   r   1CTRLLMHeadModel._reorder_cache.<locals>.<genexpr>A  s'      
-
 j_ijjj-s   "%r2  )r   r+  s    `r   _reorder_cacheCTRLLMHeadModel._reorder_cache8  s      
-
 
 	
r!   )r  r   NNNNNNNNNNNNNN)r~   r   r   r   _tied_weights_keysrR   r  r  r   r   r   r  r   r  r  r   r  r   r{   r)  staticmethodr4  r   r   r   s   @r   r  r    s    ++&  15EI6:59371559-1$(,0/3&*a
E,,-a
 "%e.?.?(@"ABa
 !!2!23	a

 !!1!12a
 u//0a
 E--.a
   1 12a
 ))*a
 D>a
 $D>a
 'tna
 d^a
 
uU\\"$::	;a
 a
Fd$ 
uU\\23
?D||
	uU\\"	#
 
r!   r  a  
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\	\	\R                           S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\	\R                     \4   4S jj5       rSrU =r$ )CTRLForSequenceClassificationiG  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g )NFr  )
rQ   rR   
num_labelsr   r   r   rV   r   
classifierr   r  s     r   rR   &CTRLForSequenceClassification.__init__S  sR      ++$V,))FMM4??O 	r!   r   r   rB   r   r   rC   r   r  rr   rs   r   r   r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   nU R                  U5      nUb  UR                  SS u  nnOUR                  SS u  nnU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S	35        U[        R                  " UUR                  S
9U4   nSnUGb  U R                   R"                  c  U R$                  S:X  a  SU R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                   R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                   R"                  S:X  a  [5        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  S9$ )a"  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
    [`PreTrainedTokenizer.encode`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Example of single-label classification:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax().item()
>>> model.config.id2label[predicted_class_id]
'LABEL_0'
```

```python
>>> import torch

>>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
>>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
>>> num_labels = len(model.config.id2label)
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

>>> labels = torch.tensor(1)
>>> loss = model(**inputs, labels=labels).loss
>>> round(loss.item(), 2)
0.93
```

Example of multi-label classification:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLForSequenceClassification.from_pretrained(
...     "Salesforce/ctrl", problem_type="multi_label_classification"
... )

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax().item()
>>> model.config.id2label[predicted_class_id]
'LABEL_0'
```

```python
>>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
>>> num_labels = len(model.config.id2label)
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

>>> num_labels = len(model.config.id2label)
>>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
...     torch.float
... )
>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()  # doctest: +IGNORE_RESULT
```Nr   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r%   )r   r$   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r   
regressionsingle_label_classificationmulti_label_classification)r!  r"  r   r   )r   r   r   r>  r9   pad_token_idr   r*   r   r   int32r(   argmaxloggerwarning_oncer^   r~   problem_typer=  r$   r   rT   r	   squeezer   r   r   r   r   r   )r]   r   r   rB   r   r   rC   r   r  rr   rs   r   r   r$  r   r"  rl   sequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr!  loss_fctrJ   s                            r   r{   %CTRLForSequenceClassification.forward\  s   P &1%<k$++B]B]"..+))%'/!5# / 
 ,A./ *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE' -;;*55	
 	
r!   )r>  r=  r   r6  )r~   r   r   r   rR   r   r   r   r  r   r  r  r   r  r   r{   r   r   r   s   @r   r;  r;  G  sW     15EI6:59371559-1$(,0/3&*p
E,,-p
 "%e.?.?(@"ABp
 !!2!23	p

 !!1!12p
 u//0p
 E--.p
   1 12p
 ))*p
 D>p
 $D>p
 'tnp
 d^p
 
uU\\"$<<	=p
 p
r!   r;  )r;  r  r   r   r7  )-__doc__typingr   r   r   numpyr:   r   r   torch.nnr   r   r	   
generationr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_ctrlr   
get_loggerr~   rG  r    r4   rK   ModulerM   r   r   r   r   r  r;  __all__r   r!   r   <module>r_     s     ) )    A A ) i i - Y Y + 
		H	%
%4L L^`&299 &R */ * ** @
# @
 @
F S
)? S
S
l 
{
$7 {

{
| cr!   