"""PyTorch CamemBERT model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, get_torch_version, logging
from .configuration_camembert import CamembertConfig


logger = logging.get_logger(__name__)


class CamembertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with the original variable name and
        # be able to load any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # If token_type_ids was not passed, use the registered buffer (all zeros), which helps
        # users when tracing the model without passing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
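

# Editor's illustration (not part of the original module): CamemBERT reserves
# `padding_idx` (the pad token id, usually 1) in the position-embedding table,
# so real tokens are numbered from `padding_idx + 1` upward while padded slots
# keep `padding_idx`. Using the helper defined at the bottom of this file:
#
#     >>> import torch
#     >>> input_ids = torch.tensor([[5, 7, 9, 1, 1]])  # 1 == pad token id
#     >>> create_position_ids_from_input_ids(input_ids, padding_idx=1)
#     tensor([[2, 3, 4, 1, 1]])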
(r"   c                       s   e Zd Zd fdd	ZdejdejfddZ						dd	ejd
eej deej deej deej dee	e	ej   dee
 de	ej fddZ  ZS )CamembertSelfAttentionNc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|p\t|dd| _| jdksh| jd	kry|j| _t	d
|j d | j| _|j| _d S )Nr   Zembedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r%   r&   relative_keyrelative_key_query   r    )r-   r.   r1   num_attention_headsrM   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer8   attention_probs_dropout_probr:   r;   r%   r3   r/   distance_embedding
is_decoderrC   rD   r%   rE   rG   rH   r.      s*   

zCamembertSelfAttention.__init__xreturnc                 C   s6   |  d d | j| jf }||}|ddddS )Nr(   r   r`   r    r   )r@   ra   rd   viewpermute)rC   rn   Znew_x_shaperG   rG   rH   transpose_for_scores   s   
z+CamembertSelfAttention.transpose_for_scoresFhidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 C   s  |  |}|d u}	|	r|d ur|d }
|d }|}nP|	r/| | |}
| | |}|}n;|d urZ| | |}
| | |}tj|d |
gdd}
tj|d |gdd}n| | |}
| | |}| |}|d u}| jrz|
|f}t||
dd}| j	dks| j	dkr	|j
d |
j
d }}|rtj|d tj|jd	dd}ntj|tj|jd	dd}tj|tj|jd	dd}|| }| || j d }|j|jd
}| j	dkrtd||}|| }n| j	dkr	td||}td|
|}|| | }|t| j }|d ur|| }tjj|dd}| |}|d ur0|| }t||}|dddd }| d d | jf }||}|rX||fn|f}| jrd||f }|S )Nr   r    r`   dimr(   r^   r_   rI   r+   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) rg   rr   rh   ri   r<   catrl   matmul	transposer%   shapeZtensorrA   rJ   rp   r=   rk   r3   tor,   Zeinsummathsqrtrd   r   
functionalZsoftmaxr:   rq   
contiguousr@   re   )rC   rs   rt   ru   rv   rw   rx   ry   Zmixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheZattention_scoresZquery_lengthZ
key_lengthZposition_ids_lZposition_ids_rZdistanceZpositional_embeddingZrelative_position_scoresZrelative_position_scores_queryZrelative_position_scores_keyZattention_probsZcontext_layerZnew_context_layer_shapeoutputsrG   rG   rH   rV      sn   









zCamembertSelfAttention.forwardNNNNNNF)rW   rX   rY   r.   r<   Tensorrr   r   FloatTensorr   boolrV   r[   rG   rG   rE   rH   r\      s4    	r\   c                       s   e Zd Zd fdd	Z						ddejdeej deej deej d	eej d
eeeej   dee	 deej f fddZ
  ZS )CamembertSdpaSelfAttentionNc                    s4   t  j||d |j| _tt tdk | _d S )Nr%   z2.2.0)r-   r.   rj   dropout_probr   parser   require_contiguous_qkvrm   rE   rG   rH   r.     s   z#CamembertSdpaSelfAttention.__init__Frs   rt   ru   rv   rw   rx   ry   ro   c              	      s  | j dks|s|d urtd t |||||||S | \}}	}
| | |}|d u}|r3|n|}|r9|n|}|rP|rP|d jd |jd krP|\}}n,| | 	|}| | 
|}|d ur||s|tj|d |gdd}tj|d |gdd}| jr||f}| jr|jjdkr|d ur| }| }| }| jr|s|d u r|	dkrdnd	}tjjj||||| jr| jnd
|d}|dd}|||	| j}|f}| jr||f }|S )Nr&   a  CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   r`   r    rz   cudaTF        )Z	attn_maskZ	dropout_p	is_causal)r%   loggerwarning_oncer-   rV   r@   rr   rg   r   rh   ri   r<   r}   rl   r   rJ   typer   r   r   Zscaled_dot_product_attentiontrainingr   r   Zreshapere   )rC   rs   rt   ru   rv   rw   rx   ry   Zbsztgt_len_r   r   Zcurrent_statesr   r   r   Zattn_outputr   rE   rG   rH   rV     s^   

 
 	
z"CamembertSdpaSelfAttention.forwardr   r   )rW   rX   rY   r.   r<   r   r   r   r   r   rV   r[   rG   rG   rE   rH   r     s2    		r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )CamembertSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr$   )r-   r.   r   rf   r1   denser6   r7   r8   r9   r:   rB   rE   rG   rH   r.   z     
zCamembertSelfOutput.__init__rs   input_tensorro   c                 C   &   |  |}| |}| || }|S r   r   r:   r6   rC   rs   r   rG   rG   rH   rV        

zCamembertSelfOutput.forwardrW   rX   rY   r.   r<   r   rV   r[   rG   rG   rE   rH   r   y      $r   )eagersdpac                       s   e Zd Zd fdd	Zdd Z						ddejdeej d	eej d
eej deej dee	e	ej   dee
 de	ej fddZ  ZS )CamembertAttentionNc                    s4   t    t|j ||d| _t|| _t | _d S )Nr   )	r-   r.    CAMEMBERT_SELF_ATTENTION_CLASSES_attn_implementationrC   r   outputsetpruned_headsrm   rE   rG   rH   r.     s   

zCamembertAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r    rz   )lenr   rC   ra   rd   r   r   rg   rh   ri   r   r   re   union)rC   headsindexrG   rG   rH   prune_heads  s   zCamembertAttention.prune_headsFrs   rt   ru   rv   rw   rx   ry   ro   c              	   C   s<   |  |||||||}| |d |}	|	f|dd   }
|
S )Nr   r    )rC   r   )rC   rs   rt   ru   rv   rw   rx   ry   Zself_outputsattention_outputr   rG   rG   rH   rV     s   
	zCamembertAttention.forwardr   r   )rW   rX   rY   r.   r   r<   r   r   r   r   r   rV   r[   rG   rG   rE   rH   r     s4    	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )CamembertIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r-   r.   r   rf   r1   intermediate_sizer   
isinstanceZ
hidden_actstrr   intermediate_act_fnrB   rE   rG   rH   r.     s
   
zCamembertIntermediate.__init__rs   ro   c                 C   s   |  |}| |}|S r   )r   r   )rC   rs   rG   rG   rH   rV     s   

zCamembertIntermediate.forwardr   rG   rG   rE   rH   r     s    r   c                       r   )CamembertOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r-   r.   r   rf   r   r1   r   r6   r7   r8   r9   r:   rB   rE   rG   rH   r.     r   zCamembertOutput.__init__rs   r   ro   c                 C   r   r   r   r   rG   rG   rH   rV     r   zCamembertOutput.forwardr   rG   rG   rE   rH   r     r   r   c                       s   e Zd Z fddZ						ddejdeej deej deej d	eej d
eeeej   dee	 deej fddZ
dd Z  ZS )CamembertLayerc                    sr   t    |j| _d| _t|| _|j| _|j| _| jr-| js&t|  dt|dd| _	t
|| _t|| _d S )Nr    z> should be used as a decoder model if cross attention is addedr&   r   )r-   r.   chunk_size_feed_forwardseq_len_dimr   	attentionrl   add_cross_attentionrb   crossattentionr   intermediater   r   rB   rE   rG   rH   r.     s   


zCamembertLayer.__init__NFrs   rt   ru   rv   rw   rx   ry   ro   c              	   C   s  |d ur
|d d nd }| j |||||d}	|	d }
| jr(|	dd }|	d }n|	dd  }d }| jro|d urot| dsDtd|  d|d urN|d	d  nd }| |
||||||}|d }
||dd  }|d }|| }t| j| j| j|
}|f| }| jr||f }|S )
Nr`   )ry   rx   r   r    r(   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r|   )	r   rl   rM   rb   r   r   feed_forward_chunkr   r   )rC   rs   rt   ru   rv   rw   rx   ry   Zself_attn_past_key_valueZself_attention_outputsr   r   Zpresent_key_valueZcross_attn_present_key_valueZcross_attn_past_key_valueZcross_attention_outputslayer_outputrG   rG   rH   rV     sP   


	

zCamembertLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )rC   r   Zintermediate_outputr   rG   rG   rH   r   1  s   
z!CamembertLayer.feed_forward_chunkr   )rW   rX   rY   r.   r<   r   r   r   r   r   rV   r   r[   rG   rG   rE   rH   r     s4    	
Ar   c                       s   e Zd Z fddZ									ddejdeej deej d	eej d
eej deeeej   dee	 dee	 dee	 dee	 de
eej ef fddZ  ZS )CamembertEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS rG   )r   ).0r   rD   rG   rH   
<listcomp><  s    z-CamembertEncoder.__init__.<locals>.<listcomp>F)	r-   r.   rD   r   Z
ModuleListrangenum_hidden_layerslayergradient_checkpointingrB   rE   r   rH   r.   9  s   
 
zCamembertEncoder.__init__NFTrs   rt   ru   rv   rw   past_key_valuesr   ry   output_hidden_statesreturn_dictro   c                 C   s^  |	rdnd }|r
dnd }|r| j jrdnd }| jr%| jr%|r%td d}|r)dnd }t| jD ]^\}}|	r;||f }|d urC|| nd }|d urM|| nd }| jrc| jrc| |j	|||||||}n
||||||||}|d }|rz||d f7 }|r||d f }| j jr||d f }q0|	r||f }|
st
dd	 |||||fD S t|||||d
S )NrG   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r(   r    r`   c                 s   s    | ]	}|d ur|V  qd S r   rG   )r   vrG   rG   rH   	<genexpr>  s    z+CamembertEncoder.forward.<locals>.<genexpr>)last_hidden_stater   rs   
attentionscross_attentions)rD   r   r   r   r   r   	enumerater   Z_gradient_checkpointing_func__call__tupler   )rC   rs   rt   ru   rv   rw   r   r   ry   r   r   Zall_hidden_statesZall_self_attentionsZall_cross_attentionsZnext_decoder_cacheiZlayer_moduleZlayer_head_maskrx   Zlayer_outputsrG   rG   rH   rV   ?  sz   


zCamembertEncoder.forward)	NNNNNNFFT)rW   rX   rY   r.   r<   r   r   r   r   r   r   r   rV   r[   rG   rG   rE   rH   r   8  sD    		
r   c                       r   )CamembertPoolerc                    s*   t    t|j|j| _t | _d S r   )r-   r.   r   rf   r1   r   ZTanh
activationrB   rE   rG   rH   r.     s   
zCamembertPooler.__init__rs   ro   c                 C   s(   |d d df }|  |}| |}|S Nr   )r   r   )rC   rs   Zfirst_token_tensorpooled_outputrG   rG   rH   rV     s   

zCamembertPooler.forwardr   rG   rG   rE   rH   r     s    r   c                   @   s$   e Zd ZeZdZdZdZdd ZdS )CamembertPreTrainedModelrobertaTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS t |tre|jj	  dS dS )zInitialize the weightsr   )meanZstdNg      ?)r   r   rf   weightdataZnormal_rD   Zinitializer_rangebiasZzero_r/   r#   r6   Zfill_CamembertLMHead)rC   modulerG   rG   rH   _init_weights  s    


z&CamembertPreTrainedModel._init_weightsN)	rW   rX   rY   r!   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_supports_sdpar   rG   rG   rG   rH   r     s    r   c                       s(   e Zd ZdZ fddZdd Z  ZS )CamembertClassificationHeadz-Head for sentence-level classification tasks.c                    sT   t    t|j|j| _|jd ur|jn|j}t|| _	t|j|j
| _d S r   )r-   r.   r   rf   r1   r   classifier_dropoutr9   r8   r:   
num_labelsout_projrC   rD   r   rE   rG   rH   r.     s   
z$CamembertClassificationHead.__init__c                 K   sL   |d d dd d f }|  |}| |}t|}|  |}| |}|S r   )r:   r   r<   tanhr   rC   featureskwargsrn   rG   rG   rH   rV     s   




z#CamembertClassificationHead.forward)rW   rX   rY   rZ   r.   rV   r[   rG   rG   rE   rH   r     s    	r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r   z,Camembert Head for masked language modeling.c                    sd   t    t|j|j| _tj|j|jd| _t|j|j	| _
tt|j	| _| j| j
_d S r   )r-   r.   r   rf   r1   r   r6   r7   
layer_normr0   decoder	Parameterr<   r?   r   rB   rE   rG   rH   r.     s   
zCamembertLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r   )r   r   r   r   r   rG   rG   rH   rV     s
   


zCamembertLMHead.forwardc                 C   s,   | j jjjdkr| j| j _d S | j j| _d S )Nmeta)r   r   rJ   r   rC   rG   rG   rH   _tie_weights  s   zCamembertLMHead._tie_weights)rW   rX   rY   rZ   r.   rV   r   r[   rG   rG   rE   rH   r     s
    	
r   c                        s   e Zd ZdZg Zd fdd	Zdd Zdd Zd	d
 Ze														dde
ej de
ej de
ej de
ej de
ej de
ej de
ej de
ej de
eej  de
e de
e de
e de
e deeej ef fddZ  ZS )CamembertModela)  



@auto_docstring
class CamembertModel(CamembertPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
    `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """

    _no_split_modules = []

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = CamembertEmbeddings(config)
        self.encoder = CamembertEncoder(config)

        self.pooler = CamembertPooler(config) if add_pooling_layer else None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        use_sdpa_attention_masks = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        # Expand the attention mask
        if use_sdpa_attention_masks and attention_mask.dim() == 2:
            # Expand the attention mask for SDPA: [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
            if self.config.is_decoder:
                extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                    attention_mask,
                    input_shape,
                    embedding_output,
                    past_key_values_length,
                )
            else:
                extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
        else:
            # We can provide a self-attention mask of dimensions
            # [batch_size, from_seq_length, to_seq_length] ourselves, in which case
            # we just need to make it broadcastable to all heads.
            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                # Expand the attention mask for SDPA: [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
                encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


@auto_docstring
class CamembertForMaskedLM(CamembertPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.lm_head = CamembertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with a `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(prediction_scores.device)
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
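

# Fill-mask sketch (editor's illustration under the same checkpoint assumption;
# `<mask>` is CamemBERT's mask token):
#
#     >>> from transformers import AutoTokenizer, CamembertForMaskedLM
#     >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
#     >>> model = CamembertForMaskedLM.from_pretrained("almanach/camembert-base")
#     >>> inputs = tokenizer("Le camembert est <mask> !", return_tensors="pt")
#     >>> logits = model(**inputs).logits
#     >>> mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
#     >>> tokenizer.decode(logits[0, mask_pos].argmax(-1))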


@auto_docstring(
    custom_intro="""
    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """
)
class CamembertForSequenceClassification(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.classifier = CamembertClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with a `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class CamembertForMultipleChoice(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.roberta = CamembertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with a `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(reshaped_logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class CamembertForTokenClassification(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with a `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class CamembertForQuestionAnswering(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with a `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class CamembertForCausalLM(CamembertPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.lm_head = CamembertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with a `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
        >>> config.is_decoder = True
        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        lm_loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(prediction_scores.device)
            lm_loss = self.loss_function(
                prediction_scores,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor
        padding_idx: int
        past_key_values_length: int

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to work with both ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = [
    "CamembertForCausalLM",
    "CamembertForMaskedLM",
    "CamembertForMultipleChoice",
    "CamembertForQuestionAnswering",
    "CamembertForSequenceClassification",
    "CamembertForTokenClassification",
    "CamembertModel",
    "CamembertPreTrainedModel",
]