o
    Zhl                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZmZmZ ddlmZmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% e#&e'Z(dd Z)G dd dej*Z+e"G dd deZ,G dd dej*Z-G dd dej*Z.G dd dej*Z/G dd dej*Z0G dd dej*Z1G dd  d ej*Z2G d!d" d"ej*Z3G d#d$ d$ej*Z4G d%d& d&ej*Z5G d'd( d(ej*Z6G d)d* d*ej*Z7e"G d+d, d,e,Z8G d-d. d.ej*Z9e"G d/d0 d0e,Z:G d1d2 d2ej*Z;e"d3d4G d5d6 d6e,Z<e"G d7d8 d8e,Z=e"G d9d: d:e,Z>e"G d;d< d<e,Z?g d=Z@dS )>zPyTorch ConvBERT model.    N)
attrgetter)CallableOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )ConvBertConfigc                 C   s  zddl }W n ty   td  w tj|}td|  |j	|}i }|D ]\}}td| d|  |j
||}	|	||< q,ddd	d
dddd}
|jdkrYd}nd}t|jD ]"}d| d|
d| d< d| d|
d| d< d| d|
d| d< d| d|
d| d< d| d|
d| d< d| d|
d| d< d| d |
d| d!< d| d"|
d| d#< d| d$|
d| d%< d| d&|
d| d'< d| d(|
d| d)< d| d*|
d| d+< d| d,|
d| d-< d| d.|
d| d/< d| d0|
d| d1< d| d2|
d| d3< d| d4|
d| d5< d| d6| d7|
d| d8< d| d6| d9|
d| d:< d| d;| d7|
d| d<< d| d;| d9|
d| d=< d| d>|
d| d?< d| d@|
d| dA< q`|  D ]c}|d }t|}|| }|
| }t|| }tdB| dC| dD |d7r|dEs|dFs|j}|dGr|ddHd}|dIr|dHdd}|dJr|dK}||_q| S )Lz'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape z"electra/embeddings/word_embeddingsz&electra/embeddings/position_embeddingsz(electra/embeddings/token_type_embeddingsz"electra/embeddings/LayerNorm/gammaz!electra/embeddings/LayerNorm/betaz!electra/embeddings_project/kernelzelectra/embeddings_project/bias)z!embeddings.word_embeddings.weightz%embeddings.position_embeddings.weightz'embeddings.token_type_embeddings.weightzembeddings.LayerNorm.weightzembeddings.LayerNorm.biaszembeddings_project.weightzembeddings_project.biasr   Zg_densedensezelectra/encoder/layer_z/attention/self/query/kernelzencoder.layer.z.attention.self.query.weightz/attention/self/query/biasz.attention.self.query.biasz/attention/self/key/kernelz.attention.self.key.weightz/attention/self/key/biasz.attention.self.key.biasz/attention/self/value/kernelz.attention.self.value.weightz/attention/self/value/biasz.attention.self.value.biasz./attention/self/conv_attn_key/depthwise_kernelz4.attention.self.key_conv_attn_layer.depthwise.weightz./attention/self/conv_attn_key/pointwise_kernelz4.attention.self.key_conv_attn_layer.pointwise.weightz"/attention/self/conv_attn_key/biasz(.attention.self.key_conv_attn_layer.biasz'/attention/self/conv_attn_kernel/kernelz(.attention.self.conv_kernel_layer.weightz%/attention/self/conv_attn_kernel/biasz&.attention.self.conv_kernel_layer.biasz&/attention/self/conv_attn_point/kernelz%.attention.self.conv_out_layer.weightz$/attention/self/conv_attn_point/biasz#.attention.self.conv_out_layer.biasz/attention/output/dense/kernelz.attention.output.dense.weightz!/attention/output/LayerNorm/gammaz".attention.output.LayerNorm.weightz/attention/output/dense/biasz.attention.output.dense.biasz /attention/output/LayerNorm/betaz .attention.output.LayerNorm.biasz/intermediate/z/kernelz.intermediate.dense.weightz/biasz.intermediate.dense.biasz/output/z.output.dense.weightz.output.dense.biasz/output/LayerNorm/gammaz.output.LayerNorm.weightz/output/LayerNorm/betaz.output.LayerNorm.biaszTF: z, PT:  z/intermediate/g_dense/kernelz/output/g_dense/kernelz/depthwise_kernel   z/pointwise_kernelz/conv_attn_key/bias)Z
tensorflowImportErrorloggererrorospathabspathinfotrainZlist_variablesZload_variable
num_groupsrangenum_hidden_layersZnamed_parametersr   torchZ
from_numpyendswithTpermute	unsqueezedata)modelconfigZtf_checkpoint_pathtfZtf_pathZ	init_varsZtf_datanameshapearrayZparam_mappingZgroup_dense_namejparam
param_nameZ	retrieverresultZtf_namevalue r<   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/convbert/modeling_convbert.pyload_tf_weights_in_convbert0   s   

	



















r>   c                       sb   e Zd ZdZ fddZ				ddeej deej deej deej d	ejf
d
dZ	  Z
S )ConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd | jdtj| j tjddd d S )	N)padding_idxZepsposition_ids)r   r   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizeZpad_token_idword_embeddingsZmax_position_embeddingsposition_embeddingsZtype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutZregister_bufferr+   ZarangeexpandzerosrB   sizelongselfr2   	__class__r<   r=   rH      s   

zConvBertEmbeddings.__init__N	input_idsrD   rB   inputs_embedsreturnc                 C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u rNt| drC| jd d d |f }||d |}|}ntj|tj| jjd}|d u rW| 	|}| 
|}	| |}
||	 |
 }| |}| |}|S )Nr   r   rD   r   rF   device)rV   rB   hasattrrD   rT   r+   rU   rW   r`   rL   rM   rN   rO   rS   )rY   r\   rD   rB   r]   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrM   rN   
embeddingsr<   r<   r=   forward   s(   






zConvBertEmbeddings.forward)NNNN)__name__
__module____qualname____doc__rH   r   r+   
LongTensorFloatTensorrg   __classcell__r<   r<   rZ   r=   r?      s$    r?   c                   @   s$   e Zd ZeZeZdZdZdd Z	dS )ConvBertPreTrainedModelconvbertTc                 C   s  t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjrF|jjjd| jjd |jdurD|jj|j 
  dS dS t |tjr[|j	j
  |jjd dS t |trh|j	j
  dS t |tr|jjjd| jjd |j	j
  dS dS )zInitialize the weights        meanZstdNg      ?)
isinstancer   LinearConv1dweightr0   normal_r2   initializer_rangebiasZzero_rI   r@   rO   Zfill_SeparableConv1DGroupedLinearLayer)rY   moduler<   r<   r=   _init_weights   s&   



z%ConvBertPreTrainedModel._init_weightsN)
rh   ri   rj   r   Zconfig_classr>   Zload_tf_weightsZbase_model_prefixZsupports_gradient_checkpointingr~   r<   r<   r<   r=   ro      s    ro   c                       6   e Zd ZdZ fddZdejdejfddZ  ZS )r{   zSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc                    s~   t    tj|||||d dd| _tj||ddd| _tt|d| _	| jj
jjd|jd | jj
jjd|jd d S )Nr   F)kernel_sizegroupspaddingrz   r   )r   rz   rq   rr   )rG   rH   r   rv   	depthwise	pointwise	Parameterr+   rU   rz   rw   r0   rx   ry   )rY   r2   Zinput_filtersZoutput_filtersr   kwargsrZ   r<   r=   rH     s   
zSeparableConv1D.__init__hidden_statesr^   c                 C   s"   |  |}| |}|| j7 }|S N)r   r   rz   )rY   r   xr<   r<   r=   rg        


zSeparableConv1D.forward	rh   ri   rj   rk   rH   r+   Tensorrg   rn   r<   r<   rZ   r=   r{     s    r{   c                       sx   e Zd Z fddZdd Z				ddejdeej d	eej d
eej dee	 de
ejeej f fddZ  ZS )ConvBertSelfAttentionc                    s`  t    |j|j dkrt|dstd|j d|j d|j|j }|dk r1|j| _d| _n|| _|j| _|j| _|j| j dkrHtd|j| j d | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _t||j| j	| j| _t
| j	| j| j | _t
|j| j	| _t
j| jdgt| jd d dgd	| _t
|j| _d S )
Nr   rK   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsr   )r   r   )rG   rH   hidden_sizenum_attention_headsra   
ValueErrorZ
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   ru   querykeyr;   r{   key_conv_attn_layerconv_kernel_layerconv_out_layerZUnfoldintunfoldrQ   Zattention_probs_dropout_probrS   )rY   r2   Znew_num_attention_headsrZ   r<   r=   rH      s<   

zConvBertSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r   r   r   r   )rV   r   r   viewr.   )rY   r   Znew_x_shaper<   r<   r=   transpose_for_scoresG  s   
z*ConvBertSelfAttention.transpose_for_scoresNFr   attention_mask	head_maskencoder_hidden_statesoutput_attentionsr^   c                 C   sV  |  |}|d}|d ur| |}| |}	n
| |}| |}	| |dd}
|
dd}
| |}| |}| |	}t|
|}| 	|}t
|d| jdg}tj|dd}| |}t
||d| jg}|dd d}tjj|| jdgd| jd d dgdd}|dd
|d| j| j}t
|d| j| jg}t||}t
|d| jg}t||dd}|t| j }|d ur|| }tjj|dd}| |}|d ur|| }t||}|dddd }t
||d| j| jg}t||gd}| d d | j| j d f }|j| }|r&||f}|S |f}|S )	Nr   r   r   r   dim)r   Zdilationr   Zstrider   )r   rV   r   r;   r   Z	transposer   r+   multiplyr   reshaper   Zsoftmaxr   r   
contiguousr/   r   Z
functionalr   r   matmulmathsqrtrS   r.   r   catr   )rY   r   r   r   r   r   Zmixed_query_layer
batch_sizeZmixed_key_layerZmixed_value_layerZmixed_key_conv_attn_layerZquery_layerZ	key_layerZvalue_layerZconv_attn_layerr   r   Zattention_scoresZattention_probsZcontext_layerZconv_outZnew_context_layer_shapeoutputsr<   r<   r=   rg   L  sh   











zConvBertSelfAttention.forwardNNNF)rh   ri   rj   rH   r   r+   r   r   rm   boolr   rg   rn   r<   r<   rZ   r=   r     s(    'r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )ConvBertSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S NrA   )rG   rH   r   ru   r   r   rO   rP   rQ   rR   rS   rX   rZ   r<   r=   rH     s   
zConvBertSelfOutput.__init__r   input_tensorr^   c                 C   &   |  |}| |}| || }|S r   r   rS   rO   rY   r   r   r<   r<   r=   rg        

zConvBertSelfOutput.forwardrh   ri   rj   rH   r+   r   rg   rn   r<   r<   rZ   r=   r     s    $r   c                       sx   e Zd Z fddZdd Z				ddejdeej d	eej d
eej dee	 de
ejeej f fddZ  ZS )ConvBertAttentionc                    s*   t    t|| _t|| _t | _d S r   )rG   rH   r   rY   r   outputsetpruned_headsrX   rZ   r<   r=   rH     s   


zConvBertAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )lenr   rY   r   r   r   r   r   r   r;   r   r   r   union)rY   headsindexr<   r<   r=   prune_heads  s   zConvBertAttention.prune_headsNFr   r   r   r   r   r^   c           	      C   s8   |  |||||}| |d |}|f|dd   }|S )Nr   r   )rY   r   )	rY   r   r   r   r   r   Zself_outputsattention_outputr   r<   r<   r=   rg     s   zConvBertAttention.forwardr   )rh   ri   rj   rH   r   r+   r   r   rm   r   r   rg   rn   r<   r<   rZ   r=   r     s(    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )r|   c                    sj   t    || _|| _|| _| j| j | _| j| j | _tt	
| j| j| j| _tt	
|| _d S r   )rG   rH   
input_sizeoutput_sizer(   group_in_dimZgroup_out_dimr   r   r+   emptyrw   rz   )rY   r   r   r(   rZ   r<   r=   rH     s   
zGroupedLinearLayer.__init__r   r^   c                 C   sr   t | d }t|d| j| jg}|ddd}t|| j}|ddd}t||d| j	g}|| j
 }|S )Nr   r   r   r   )listrV   r+   r   r(   r   r.   r   rw   r   rz   )rY   r   r   r   r<   r<   r=   rg     s   
zGroupedLinearLayer.forwardr   r<   r<   rZ   r=   r|     s    
r|   c                       r   )ConvBertIntermediatec                    sf   t    |jdkrt|j|j| _nt|j|j|jd| _t	|j
tr-t|j
 | _d S |j
| _d S )Nr   r   r   r(   )rG   rH   r(   r   ru   r   intermediate_sizer   r|   rt   
hidden_actstrr   intermediate_act_fnrX   rZ   r<   r=   rH     s   

zConvBertIntermediate.__init__r   r^   c                 C   s   |  |}| |}|S r   )r   r   rY   r   r<   r<   r=   rg     s   

zConvBertIntermediate.forwardr   r<   r<   rZ   r=   r     s    r   c                       r   )ConvBertOutputc                    sd   t    |jdkrt|j|j| _nt|j|j|jd| _tj	|j|j
d| _	t|j| _d S )Nr   r   rA   )rG   rH   r(   r   ru   r   r   r   r|   rO   rP   rQ   rR   rS   rX   rZ   r<   r=   rH     s   

zConvBertOutput.__init__r   r   r^   c                 C   r   r   r   r   r<   r<   r=   rg     r   zConvBertOutput.forwardr   r<   r<   rZ   r=   r     s    $r   c                       s   e Zd Z fddZ					ddejdeej deej deej d	eej d
ee de	ejeej f fddZ
dd Z  ZS )ConvBertLayerc                    sn   t    |j| _d| _t|| _|j| _|j| _| jr+| js&t|  dt|| _	t
|| _t|| _d S )Nr   z> should be used as a decoder model if cross attention is added)rG   rH   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr   intermediater   r   rX   rZ   r<   r=   rH     s   



zConvBertLayer.__init__NFr   r   r   r   encoder_attention_maskr   r^   c                 C   s   | j ||||d}|d }|dd  }	| jr<|d ur<t| ds'td|  d| |||||}
|
d }|	|
dd   }	t| j| j| j|}|f|	 }	|	S )N)r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r   r   ra   AttributeErrorr   r   feed_forward_chunkr   r   )rY   r   r   r   r   r   r   Zself_attention_outputsr   r   Zcross_attention_outputslayer_outputr<   r<   r=   rg   &  s6   	


zConvBertLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )rY   r   Zintermediate_outputr   r<   r<   r=   r   N  s   
z ConvBertLayer.feed_forward_chunk)NNNNF)rh   ri   rj   rH   r+   r   r   rm   r   r   rg   r   rn   r<   r<   rZ   r=   r     s.    
(r   c                       s   e Zd Z fddZ							ddejdeej deej d	eej d
eej dee dee dee de	e
ef fddZ  ZS )ConvBertEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r<   )r   ).0_r2   r<   r=   
<listcomp>X  s    z,ConvBertEncoder.__init__.<locals>.<listcomp>F)	rG   rH   r2   r   Z
ModuleListr)   r*   layergradient_checkpointingrX   rZ   r   r=   rH   U  s   
 
zConvBertEncoder.__init__NFTr   r   r   r   r   r   output_hidden_statesreturn_dictr^   c	              
   C   s  |rdnd }	|r
dnd }
|r| j jrdnd }t| jD ]I\}}|r&|	|f }	|d ur.|| nd }| jrC| jrC| |j||||||}n	|||||||}|d }|rd|
|d f }
| j jrd||d f }q|rl|	|f }	|s{tdd ||	|
|fD S t	||	|
|dS )Nr<   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r<   )r   vr<   r<   r=   	<genexpr>  s    z*ConvBertEncoder.forward.<locals>.<genexpr>)Zlast_hidden_stater   
attentionsZcross_attentions)
r2   r   	enumerater   r   ZtrainingZ_gradient_checkpointing_func__call__tupler   )rY   r   r   r   r   r   r   r   r   Zall_hidden_statesZall_self_attentionsZall_cross_attentionsiZlayer_moduleZlayer_head_maskZlayer_outputsr<   r<   r=   rg   [  sV   



zConvBertEncoder.forward)NNNNFFT)rh   ri   rj   rH   r+   r   r   rm   r   r   r   r   rg   rn   r<   r<   rZ   r=   r   T  s8    		

r   c                       r   )ConvBertPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S r   )rG   rH   r   ru   r   r   rt   r   r   r   transform_act_fnrO   rP   rX   rZ   r<   r=   rH     s   
z(ConvBertPredictionHeadTransform.__init__r   r^   c                 C   s"   |  |}| |}| |}|S r   )r   r   rO   r   r<   r<   r=   rg     r   z'ConvBertPredictionHeadTransform.forwardr   r<   r<   rZ   r=   r     s    	r   c                       sJ   e Zd ZdZdef fddZ	ddejdeej	 dejfd	d
Z
  ZS )ConvBertSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ConvBertConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r2   c                    s   t    t|dd| _| jdkrtt | _t|dr<|j	r<t|dr1|j
r1|jdkr1|j}n|j}t|j|| _t|dd }|rHt|nt | _t | _t|drc|jdkrct|j| _t | _t|d	r{|jdkr}t|j| _d S d S d S )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   Zsummary_activationsummary_first_dropoutsummary_last_dropout)rG   rH   getattrr   NotImplementedErrorr   ZIdentitysummaryra   r   r   
num_labelsr   ru   r   
activationfirst_dropoutr   rQ   last_dropoutr   )rY   r2   Znum_classesZactivation_stringrZ   r<   r=   rH     s&   




z ConvBertSequenceSummary.__init__Nr   	cls_indexr^   c                 C   s  | j dkr|dddf }ne| j dkr|dddf }nW| j dkr(|jdd}nK| j d	krl|du rItj|d
ddddf |jd d tjd}n|dd}|d| d  |	df }|
d|d}n| j dkrst| |}| |}| |}| |}|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r   Nr   firstr   rs   r   r   r   .r   rE   )r   r   )r   rs   r+   Z	full_liker5   rW   r/   rT   r   rV   gathersqueezer   r   r   r   r   )rY   r   r   r   r<   r<   r=   rg     s.   



"




zConvBertSequenceSummary.forwardr   )rh   ri   rj   rk   r   rH   r+   rm   r   rl   rg   rn   r<   r<   rZ   r=   r     s    r   c                       s   e Zd Z fddZdd Zdd Zdd Ze																		dd
ee	j
 dee	j dee	j
 dee	j
 dee	j dee	j dee dee dee deeef fddZ  ZS )ConvBertModelc                    sP   t  | t|| _|j|jkrt|j|j| _t	|| _
|| _|   d S r   )rG   rH   r?   rf   rK   r   r   ru   embeddings_projectr   encoderr2   	post_initrX   rZ   r<   r=   rH     s   

zConvBertModel.__init__c                 C   s   | j jS r   rf   rL   rY   r<   r<   r=   get_input_embeddings  s   z"ConvBertModel.get_input_embeddingsc                 C   s   || j _d S r   r  )rY   r;   r<   r<   r=   set_input_embeddings  s   z"ConvBertModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )rY   Zheads_to_pruner   r   r<   r<   r=   _prune_heads"  s   zConvBertModel._prune_headsNr\   r   rD   rB   r   r]   r   r   r   r^   c
                 C   sr  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|
\}}|d urU|jn|j}|d u rctj	|
|d}|d u rt
| jdr| jjd d d |f }|||}|}n	tj|
tj|d}| ||
}| || j j}| j||||d}t
| dr| |}| j||||||	d	}|S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r`   rD   r_   )r\   rB   rD   r]   r  )r   r   r   r   r   )r2   r   r   use_return_dictr   Z%warn_if_padding_and_no_attention_maskrV   r`   r+   Zonesra   rf   rD   rT   rU   rW   Zget_extended_attention_maskZget_head_maskr*   r  r  )rY   r\   r   rD   rB   r   r]   r   r   r   rb   r   rc   r`   rd   re   Zextended_attention_maskr   r<   r<   r=   rg   *  sL   


	zConvBertModel.forward)	NNNNNNNNN)rh   ri   rj   rH   r	  r
  r  r   r   r+   rl   rm   r   r   r   r   rg   rn   r<   r<   rZ   r=   r    sH    	

r  c                       r   )ConvBertGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                    s>   t    td| _tj|j|jd| _t|j	|j| _
d S )NZgelurA   )rG   rH   r   r   r   rO   rK   rP   ru   r   r   rX   rZ   r<   r=   rH   m  s   

z%ConvBertGeneratorPredictions.__init__generator_hidden_statesr^   c                 C   s"   |  |}| |}| |}|S r   )r   r   rO   )rY   r  r   r<   r<   r=   rg   t  s   


z$ConvBertGeneratorPredictions.forward)	rh   ri   rj   rk   rH   r+   rm   rg   rn   r<   r<   rZ   r=   r  j  s    r  c                       s   e Zd ZdgZ fddZdd Zdd Ze										dd	ee	j
 d
ee	j dee	j
 dee	j
 dee	j dee	j dee	j
 dee dee dee deeef fddZ  ZS )ConvBertForMaskedLMzgenerator.lm_head.weightc                    s>   t  | t|| _t|| _t|j|j	| _
|   d S r   )rG   rH   r  rp   r  generator_predictionsr   ru   rK   rJ   generator_lm_headr  rX   rZ   r<   r=   rH     s
   

zConvBertForMaskedLM.__init__c                 C   s   | j S r   r  r  r<   r<   r=   get_output_embeddings  s   z)ConvBertForMaskedLM.get_output_embeddingsc                 C   s
   || _ d S r   r  )rY   rL   r<   r<   r=   set_output_embeddings  s   
z)ConvBertForMaskedLM.set_output_embeddingsNr\   r   rD   rB   r   r]   labelsr   r   r   r^   c                 C   s   |
dur|
n| j j}
| ||||||||	|
	}|d }| |}| |}d}|dur=t }||d| j j|d}|
sS|f|dd  }|durQ|f| S |S t	|||j
|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r   r   losslogitsr   r   )r2   r  rp   r  r  r   r	   r   rJ   r   r   r   )rY   r\   r   rD   rB   r   r]   r  r   r   r   r  Zgenerator_sequence_outputZprediction_scoresr  loss_fctr   r<   r<   r=   rg     s8   

zConvBertForMaskedLM.forward
NNNNNNNNNN)rh   ri   rj   Z_tied_weights_keysrH   r  r  r   r   r+   rl   rm   r   r   r   r   rg   rn   r<   r<   rZ   r=   r  |  sN    
	

r  c                       r   )ConvBertClassificationHeadz-Head for sentence-level classification tasks.c                    sZ   t    t|j|j| _|jd ur|jn|j}t|| _	t|j|j
| _|| _d S r   )rG   rH   r   ru   r   r   classifier_dropoutrR   rQ   rS   r   out_projr2   rY   r2   r  rZ   r<   r=   rH     s   

z#ConvBertClassificationHead.__init__r   r^   c                 K   sR   |d d dd d f }|  |}| |}t| jj |}|  |}| |}|S )Nr   )rS   r   r   r2   r   r  )rY   r   r   r   r<   r<   r=   rg     s   



z"ConvBertClassificationHead.forwardr   r<   r<   rZ   r=   r    s    r  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )Zcustom_introc                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eef fddZ  ZS )!ConvBertForSequenceClassificationc                    s:   t  | |j| _|| _t|| _t|| _|   d S r   )	rG   rH   r   r2   r  rp   r  
classifierr  rX   rZ   r<   r=   rH     s   

z*ConvBertForSequenceClassification.__init__Nr\   r   rD   rB   r   r]   r  r   r   r   r^   c                 C   sh  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur| j jdu rQ| jdkr7d| j _n| jdkrM|jtjksH|jtj	krMd| j _nd| j _| j jdkrot
 }| jdkri|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   rD   rB   r   r]   r   r   r   r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationr   r  )r2   r  rp   r"  Zproblem_typer   rF   r+   rW   r   r
   r  r	   r   r   r   r   r   rY   r\   r   rD   rB   r   r]   r  r   r   r   r   sequence_outputr  r  r  r   r<   r<   r=   rg     sT   


"


z)ConvBertForSequenceClassification.forwardr  )rh   ri   rj   rH   r   r   r+   rl   rm   r   r   r   r   rg   rn   r<   r<   rZ   r=   r!    sH    
	

r!  c                       r   )ConvBertForMultipleChoicec                    s<   t  | t|| _t|| _t|jd| _	| 
  d S )Nr   )rG   rH   r  rp   r   sequence_summaryr   ru   r   r"  r  rX   rZ   r<   r=   rH   ;  s
   

z"ConvBertForMultipleChoice.__init__Nr\   r   rD   rB   r   r]   r  r   r   r   r^   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   r   r#  r   r  )r2   r  r5   r   rV   rp   r'  r"  r	   r   r   r   )rY   r\   r   rD   rB   r   r]   r  r   r   r   Znum_choicesr   r%  Zpooled_outputr  Zreshaped_logitsr  r  r   r<   r<   r=   rg   E  sL   -


z!ConvBertForMultipleChoice.forwardr  )rh   ri   rj   rH   r   r   r+   rl   rm   r   r   r   r   rg   rn   r<   r<   rZ   r=   r&  9  sH    
	

r&  c                       r   )ConvBertForTokenClassificationc                    s^   t  | |j| _t|| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S r   )rG   rH   r   r  rp   r  rR   r   rQ   rS   ru   r   r"  r  r  rZ   r<   r=   rH     s   
z'ConvBertForTokenClassification.__init__Nr\   r   rD   rB   r   r]   r  r   r   r   r^   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur<t }||d| j|d}|
sR|f|dd  }|durP|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr#  r   r   r   r  )r2   r  rp   rS   r"  r	   r   r   r   r   r   r$  r<   r<   r=   rg     s8   

z&ConvBertForTokenClassification.forwardr  )rh   ri   rj   rH   r   r   r+   rl   rm   r   r   r   r   rg   rn   r<   r<   rZ   r=   r(    sH    	

r(  c                       s   e Zd Z fddZe											ddeej deej deej deej deej d	eej d
eej deej dee	 dee	 dee	 de
eef fddZ  ZS )ConvBertForQuestionAnsweringc                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r   )
rG   rH   r   r  rp   r   ru   r   
qa_outputsr  rX   rZ   r<   r=   rH     s
   
z%ConvBertForQuestionAnswering.__init__Nr\   r   rD   rB   r   r]   start_positionsend_positionsr   r   r   r^   c                 C   sH  |d ur|n| j j}| j|||||||	|
|d	}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrO|d}t| dkr\|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr#  r   r   r   r   )Zignore_indexr   )r  start_logits
end_logitsr   r   )r2   r  rp   r*  splitr  r   r   rV   clampr	   r   r   r   )rY   r\   r   rD   rB   r   r]   r+  r,  r   r   r   r   r%  r  r-  r.  Z
total_lossZignored_indexr  Z
start_lossZend_lossr   r<   r<   r=   rg     sP   






z$ConvBertForQuestionAnswering.forward)NNNNNNNNNNN)rh   ri   rj   rH   r   r   r+   rl   rm   r   r   r   r   rg   rn   r<   r<   rZ   r=   r)    sN    
	

r)  )	r  r&  r)  r!  r(  r   r  ro   r>   )Ark   r   r#   operatorr   typingr   r   r   r   r+   Ztorch.utils.checkpointr   Ztorch.nnr   r	   r
   Zactivationsr   r   Zmodeling_outputsr   r   r   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   r   utilsr   r   Zconfiguration_convbertr   Z
get_loggerrh   r!   r>   Moduler?   ro   r{   r   r   r   r|   r   r   r   r   r   r   r  r  r  r  r!  r&  r(  r)  __all__r<   r<   r<   r=   <module>   sb    
|< -=Ec[KShEM