"""PyTorch CTRL model."""

from typing import Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging
from .configuration_ctrl import CTRLConfig


logger = logging.get_logger(__name__)


def angle_defn(pos, i, d_model_size):
    angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
    return pos * angle_rates


def positional_encoding(position, d_model_size, dtype):
    # create the sinusoidal pattern for the positional encoding
    angle_rads = angle_defn(
        torch.arange(position, dtype=torch.int64).to(dtype).unsqueeze(1),
        torch.arange(d_model_size, dtype=torch.int64).to(dtype).unsqueeze(0),
        d_model_size,
    )

    sines = torch.sin(angle_rads[:, 0::2])
    cosines = torch.cos(angle_rads[:, 1::2])

    pos_encoding = torch.cat([sines, cosines], dim=-1)
    return pos_encoding


def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
    # calculate attention scores
    matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))

    dk = k.shape[-1]
    scaled_attention_logits = matmul_qk / np.sqrt(dk)

    if mask is not None:
        nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
        scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4

    if attention_mask is not None:
        # Apply the (additive) attention mask
        scaled_attention_logits = scaled_attention_logits + attention_mask

    attention_weights = torch.softmax(scaled_attention_logits, dim=-1)

    # Mask heads if we want to
    if head_mask is not None:
        attention_weights = attention_weights * head_mask

    output = torch.matmul(attention_weights, v)

    return output, attention_weights

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model_size, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model_size = d_model_size

        self.depth = int(d_model_size / self.num_heads)

        self.Wq = nn.Linear(d_model_size, d_model_size)
        self.Wk = nn.Linear(d_model_size, d_model_size)
        self.Wv = nn.Linear(d_model_size, d_model_size)

        self.dense = nn.Linear(d_model_size, d_model_size)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        attention_head_size = self.d_model_size // self.num_heads
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, attention_head_size, self.pruned_heads)

        # Prune linear layers
        self.Wq = prune_linear_layer(self.Wq, index)
        self.Wk = prune_linear_layer(self.Wk, index)
        self.Wv = prune_linear_layer(self.Wv, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update hyper params
        self.num_heads = self.num_heads - len(heads)
        self.d_model_size = attention_head_size * self.num_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def split_into_heads(self, x, batch_size):
        x = x.reshape(batch_size, -1, self.num_heads, self.depth)
        return x.permute([0, 2, 1, 3])

    def forward(
        self,
        v,
        k,
        q,
        mask,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        batch_size = q.shape[0]

        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wv(v)

        q = self.split_into_heads(q, batch_size)
        k = self.split_into_heads(k, batch_size)
        v = self.split_into_heads(v, batch_size)
        if layer_past is not None:
            past_key, past_value = layer_past[0], layer_past[1]
            k = torch.cat((past_key, k), dim=-2)
            v = torch.cat((past_value, v), dim=-2)

        if use_cache is True:
            present = torch.stack((k, v))
        else:
            present = (None,)

        output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
        scaled_attention = output[0].permute([0, 2, 1, 3])
        attn = output[1]
        original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size)
        output = self.dense(original_size_attention)

        outputs = (output, present)
        if output_attentions:
            outputs = outputs + (attn,)
        return outputs


def point_wise_feed_forward_network(d_model_size, dff):
    return nn.Sequential(nn.Linear(d_model_size, dff), nn.ReLU(), nn.Linear(dff, d_model_size))
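
# Illustrative shape walk-through for the attention block above (a sketch with made-up sizes, not tied
# to the pretrained checkpoint): `split_into_heads` turns a `(batch, seq_len, d_model_size)` projection
# into `(batch, num_heads, seq_len, depth)` with `depth = d_model_size // num_heads`, so for example
# d_model_size=8 and num_heads=2 give per-head tensors of shape `(batch, 2, seq_len, 4)`, and the
# attention weights returned alongside the output have shape `(batch, num_heads, seq_len, seq_len)`.
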

class EncoderLayer(nn.Module):
    def __init__(self, d_model_size, num_heads, dff, rate=0.1):
        super().__init__()

        self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model_size, dff)

        self.layernorm1 = nn.LayerNorm(d_model_size, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(d_model_size, eps=1e-6)

        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(
        self, x, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False
    ):
        normed = self.layernorm1(x)
        attn_outputs = self.multi_head_attention(
            normed,
            normed,
            normed,
            mask,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]
        attn_output = self.dropout1(attn_output)
        out1 = x + attn_output

        out2 = self.layernorm2(out1)
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout2(ffn_output)
        out2 = out1 + ffn_output

        outputs = (out2,) + attn_outputs[1:]
        return outputs


@auto_docstring
class CTRLPreTrainedModel(PreTrainedModel):
    config_class = CTRLConfig
    base_model_prefix = "transformer"

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, Conv1D)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class CTRLModel(CTRLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.d_model_size = config.n_embd
        self.num_layers = config.n_layer

        self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float)

        self.w = nn.Embedding(config.vocab_size, config.n_embd)

        self.dropout = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList(
            [EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop) for _ in range(config.n_layer)]
        )
        self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.w

    def set_input_embeddings(self, new_embeddings):
        self.w = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].multi_head_attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CTRLModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 5, 1280]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].size(-2)
        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)

        # Attention mask: turn the 2D padding mask into an additive bias of shape
        # [batch_size, 1, 1, to_seq_length] that broadcasts over heads and query positions.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
            token_type_embeds = self.w(token_type_ids)
            token_type_embeds *= np.sqrt(self.d_model_size)
        else:
            token_type_embeds = 0

        if inputs_embeds is None:
            inputs_embeds = self.w(input_ids)
        seq_len = input_shape[-1]
        mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device)

        inputs_embeds *= np.sqrt(self.d_model_size)

        # `self.pos_encoding` is a plain attribute, not a buffer, so move it to the right device manually.
        self.pos_encoding = self.pos_encoding.to(device)
        pos_embeds = self.pos_encoding[position_ids, :]

        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

        hidden_states = self.dropout(hidden_states)

        presents = () if use_cache else None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for i, (h, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            outputs = h(
                hidden_states,
                mask,
                layer_past=layer_past,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states, present = outputs[:2]
            if use_cache is True:
                presents = presents + (present,)

            if output_attentions:
                all_attentions += (outputs[2],)

        hidden_states = self.layernorm(hidden_states)
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )


@auto_docstring(
    custom_intro="""
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )Zcustom_introc                       s  e Zd ZdgZ fddZdd Zdd Ze												dd	ee	j
 d
eeee	j   dee	j dee	j
 dee	j
 dee	j dee	j dee	j
 dee dee dee dee deee	j ef fddZdddZed
eee	j  de	jdeee	j  fddZ  ZS )CTRLLMHeadModelzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NTr{   )
r@   rA   r   rt   r   rE   r   r   lm_headr   r   rM   r   r   rA     s   
zCTRLLMHeadModel.__init__c                 C   r   r?   r   r   r   r   r   get_output_embeddings  r   z%CTRLLMHeadModel.get_output_embeddingsc                 C   r   r?   r   r   r   r   r   set_output_embeddings  r   z%CTRLLMHeadModel.set_output_embeddingsNr   r   r8   r   r   r9   r   labelsr[   r\   r   r   r   c                 K   s   |dur|n| j j}| j||||||||	|
||d}|d }| |}d}|dur7| j||fd| j ji|}|sM|f|dd  }|durK|f| S |S t|||j|j|j	dS )a
  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLLMHeadModel

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> sequence_ids = model.generate(inputs["input_ids"])
        >>> sequences = tokenizer.batch_decode(sequence_ids)
        >>> sequences
        ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> round(outputs.loss.item(), 2)
        9.21

        >>> list(outputs.logits.shape)
        [1, 5, 246534]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(lm_logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs):
        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache}

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )


@auto_docstring(
    custom_intro="""
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it needs to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    """
)
class CTRLForSequenceClassification(CTRLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = CTRLModel(config)
        self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> import torch

        >>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> labels = torch.tensor(1)
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)
        0.93
        ```

        Example of multi-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained(
        ...     "Salesforce/ctrl", problem_type="multi_label_classification"
        ... )

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> num_labels = len(model.config.id2label)
        >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
        ...     torch.float
        ... )
        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()  # doctest: +IGNORE_RESULT
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.classifier(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not the pad token
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=pooled_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["CTRLForSequenceClassification", "CTRLLMHeadModel", "CTRLModel", "CTRLPreTrainedModel"]