"""PyTorch I-BERT model."""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import gelu
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging
from .configuration_ibert import IBertConfig
from .quant_modules import IntGELU, IntLayerNorm, IntSoftmax, QuantAct, QuantEmbedding, QuantLinear


logger = logging.get_logger(__name__)


class IBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.embedding_bit = 8
        self.embedding_act_bit = 16
        self.act_bit = 8
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.word_embeddings = QuantEmbedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id,
            weight_bit=self.embedding_bit, quant_mode=self.quant_mode,
        )
        self.token_type_embeddings = QuantEmbedding(
            config.type_vocab_size, config.hidden_size, weight_bit=self.embedding_bit, quant_mode=self.quant_mode
        )

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.padding_idx = config.pad_token_id
        self.position_embeddings = QuantEmbedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx,
            weight_bit=self.embedding_bit, quant_mode=self.quant_mode,
        )

        # Integer-only addition between the word, token-type and position embeddings
        self.embeddings_act1 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
        self.embeddings_act2 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)

        # self.LayerNorm is not snake-cased to stick with the TensorFlow variable name and be able
        # to load any TensorFlow checkpoint file.
        self.LayerNorm = IntLayerNorm(
            config.hidden_size, eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(
                    input_ids, self.padding_idx, past_key_values_length
                ).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds, inputs_embeds_scaling_factor = self.word_embeddings(input_ids)
        else:
            inputs_embeds_scaling_factor = None
        token_type_embeddings, token_type_embeddings_scaling_factor = self.token_type_embeddings(token_type_ids)

        embeddings, embeddings_scaling_factor = self.embeddings_act1(
            inputs_embeds,
            inputs_embeds_scaling_factor,
            identity=token_type_embeddings,
            identity_scaling_factor=token_type_embeddings_scaling_factor,
        )

        if self.position_embedding_type == "absolute":
            position_embeddings, position_embeddings_scaling_factor = self.position_embeddings(position_ids)
            embeddings, embeddings_scaling_factor = self.embeddings_act1(
                embeddings,
                embeddings_scaling_factor,
                identity=position_embeddings,
                identity_scaling_factor=position_embeddings_scaling_factor,
            )

        embeddings, embeddings_scaling_factor = self.LayerNorm(embeddings, embeddings_scaling_factor)
        embeddings = self.dropout(embeddings)
        embeddings, embeddings_scaling_factor = self.output_activation(embeddings, embeddings_scaling_factor)
        return embeddings, embeddings_scaling_factor

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
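
# Illustration (not part of the upstream module): every quantized block in this file passes
# activations around as a `(tensor, scaling_factor)` pair, where `tensor` is the dequantized
# value `scaling_factor * integer_representation`. The sketch below shows the symmetric
# quantization arithmetic this convention assumes; the real logic lives in
# `.quant_modules.QuantAct` and friends and may differ in detail.
def _symmetric_quantization_sketch(x: torch.Tensor, num_bits: int = 8):
    """Toy example: quantize `x` to signed `num_bits` integers, return (dequantized tensor, scale)."""
    n_levels = 2 ** (num_bits - 1) - 1  # e.g. 127 for 8 bits
    scaling_factor = x.abs().max().clamp(min=1e-8) / n_levels
    x_int = torch.round(x / scaling_factor).clamp(-n_levels, n_levels)
    # Same pair shape that the I-BERT modules below produce and consume
    return x_int * scaling_factor, scaling_factor
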
class IBertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.quant_mode = config.quant_mode
        self.weight_bit = 8
        self.bias_bit = 32
        self.act_bit = 8

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Q, K, V linear layers
        self.query = QuantLinear(
            config.hidden_size, self.all_head_size, bias=True,
            weight_bit=self.weight_bit, bias_bit=self.bias_bit, quant_mode=self.quant_mode, per_channel=True,
        )
        self.key = QuantLinear(
            config.hidden_size, self.all_head_size, bias=True,
            weight_bit=self.weight_bit, bias_bit=self.bias_bit, quant_mode=self.quant_mode, per_channel=True,
        )
        self.value = QuantLinear(
            config.hidden_size, self.all_head_size, bias=True,
            weight_bit=self.weight_bit, bias_bit=self.bias_bit, quant_mode=self.quant_mode, per_channel=True,
        )

        # Requantization (32bit -> 8bit) for the Q, K, V activations
        self.query_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.key_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.value_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type != "absolute":
            raise ValueError("I-BERT only supports 'absolute' for `config.position_embedding_type`")

        self.softmax = IntSoftmax(self.act_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        # Projection
        mixed_query_layer, mixed_query_layer_scaling_factor = self.query(hidden_states, hidden_states_scaling_factor)
        mixed_key_layer, mixed_key_layer_scaling_factor = self.key(hidden_states, hidden_states_scaling_factor)
        mixed_value_layer, mixed_value_layer_scaling_factor = self.value(hidden_states, hidden_states_scaling_factor)

        # Requantization
        query_layer, query_layer_scaling_factor = self.query_activation(
            mixed_query_layer, mixed_query_layer_scaling_factor
        )
        key_layer, key_layer_scaling_factor = self.key_activation(mixed_key_layer, mixed_key_layer_scaling_factor)
        value_layer, value_layer_scaling_factor = self.value_activation(
            mixed_value_layer, mixed_value_layer_scaling_factor
        )

        # Transpose
        query_layer = self.transpose_for_scores(query_layer)
        key_layer = self.transpose_for_scores(key_layer)
        value_layer = self.transpose_for_scores(value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        scale = math.sqrt(self.attention_head_size)
        attention_scores = attention_scores / scale
        if self.quant_mode:
            attention_scores_scaling_factor = query_layer_scaling_factor * key_layer_scaling_factor / scale
        else:
            attention_scores_scaling_factor = None

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the IBertModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs, attention_probs_scaling_factor = self.softmax(
            attention_scores, attention_scores_scaling_factor
        )

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        if attention_probs_scaling_factor is not None:
            context_layer_scaling_factor = attention_probs_scaling_factor * value_layer_scaling_factor
        else:
            context_layer_scaling_factor = None

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        # Requantization: 32bit -> 8bit
        context_layer, context_layer_scaling_factor = self.output_activation(
            context_layer, context_layer_scaling_factor
        )

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        output_scaling_factor = (
            (context_layer_scaling_factor, attention_probs_scaling_factor)
            if output_attentions
            else (context_layer_scaling_factor,)
        )
        return outputs, output_scaling_factor


class IBertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.dense = QuantLinear(
            config.hidden_size, config.hidden_size, bias=True,
            weight_bit=self.weight_bit, bias_bit=self.bias_bit, quant_mode=self.quant_mode, per_channel=True,
        )
        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
        self.LayerNorm = IntLayerNorm(
            config.hidden_size, eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states = self.dropout(hidden_states)
        # Integer-only residual addition, then integer LayerNorm
        hidden_states, hidden_states_scaling_factor = self.ln_input_act(
            hidden_states, hidden_states_scaling_factor,
            identity=input_tensor, identity_scaling_factor=input_tensor_scaling_factor,
        )
        hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor
class IBertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.self = IBertSelfAttention(config)
        self.output = IBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        self_outputs, self_outputs_scaling_factor = self.self(
            hidden_states, hidden_states_scaling_factor, attention_mask, head_mask, output_attentions
        )
        attention_output, attention_output_scaling_factor = self.output(
            self_outputs[0], self_outputs_scaling_factor[0], hidden_states, hidden_states_scaling_factor
        )
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        outputs_scaling_factor = (attention_output_scaling_factor,) + self_outputs_scaling_factor[1:]
        return outputs, outputs_scaling_factor


class IBertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.dense = QuantLinear(
            config.hidden_size, config.intermediate_size, bias=True,
            weight_bit=self.weight_bit, bias_bit=self.bias_bit, quant_mode=self.quant_mode, per_channel=True,
        )
        if config.hidden_act != "gelu":
            raise ValueError("I-BERT only supports 'gelu' for `config.hidden_act`")
        self.intermediate_act_fn = IntGELU(quant_mode=self.quant_mode, force_dequant=config.force_dequant)
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)

    def forward(self, hidden_states, hidden_states_scaling_factor):
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states, hidden_states_scaling_factor = self.intermediate_act_fn(
            hidden_states, hidden_states_scaling_factor
        )
        # Requantization: 32bit -> 8bit
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor


class IBertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.dense = QuantLinear(
            config.intermediate_size, config.hidden_size, bias=True,
            weight_bit=self.weight_bit, bias_bit=self.bias_bit, quant_mode=self.quant_mode, per_channel=True,
        )
        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
        self.LayerNorm = IntLayerNorm(
            config.hidden_size, eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states = self.dropout(hidden_states)
        hidden_states, hidden_states_scaling_factor = self.ln_input_act(
            hidden_states, hidden_states_scaling_factor,
            identity=input_tensor, identity_scaling_factor=input_tensor_scaling_factor,
        )
        hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor


class IBertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.seq_len_dim = 1
        self.attention = IBertAttention(config)
        self.intermediate = IBertIntermediate(config)
        self.output = IBertOutput(config)

        self.pre_intermediate_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.pre_output_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        self_attention_outputs, self_attention_outputs_scaling_factor = self.attention(
            hidden_states, hidden_states_scaling_factor,
            attention_mask, head_mask, output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        attention_output_scaling_factor = self_attention_outputs_scaling_factor[0]

        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        layer_output, layer_output_scaling_factor = self.feed_forward_chunk(
            attention_output, attention_output_scaling_factor
        )
        outputs = (layer_output,) + outputs
        return outputs

    def feed_forward_chunk(self, attention_output, attention_output_scaling_factor):
        attention_output, attention_output_scaling_factor = self.pre_intermediate_act(
            attention_output, attention_output_scaling_factor
        )
        intermediate_output, intermediate_output_scaling_factor = self.intermediate(
            attention_output, attention_output_scaling_factor
        )
        intermediate_output, intermediate_output_scaling_factor = self.pre_output_act(
            intermediate_output, intermediate_output_scaling_factor
        )
        layer_output, layer_output_scaling_factor = self.output(
            intermediate_output, intermediate_output_scaling_factor, attention_output, attention_output_scaling_factor
        )
        return layer_output, layer_output_scaling_factor
class IBertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.quant_mode = config.quant_mode
        self.layer = nn.ModuleList([IBertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = None  # cross-attention is not supported by I-BERT
        next_decoder_cache = None  # caching is not supported by I-BERT

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states, hidden_states_scaling_factor, attention_mask, layer_head_mask, output_attentions
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class IBertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # "Pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class IBertPreTrainedModel(PreTrainedModel):
    config_class = IBertConfig
    base_model_prefix = "ibert"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (QuantLinear, nn.Linear)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (QuantEmbedding, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, IBertLMHead):
            module.bias.data.zero_()

    def resize_token_embeddings(self, new_num_tokens=None):
        raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
@auto_docstring
class IBertModel(IBertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    """

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config
        self.quant_mode = config.quant_mode

        self.embeddings = IBertEmbeddings(config)
        self.encoder = IBertEncoder(config)

        self.pooler = IBertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple[torch.FloatTensor]]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, embedding_output_scaling_factor = self.embeddings(
            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )
        encoder_outputs = self.encoder(
            embedding_output,
            embedding_output_scaling_factor,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


@auto_docstring
class IBertForMaskedLM(IBertPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.lm_head = IBertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings
        self.lm_head.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   rX   r'   r   rY   r   r   r   r   r(   rw   losslogitsr   r   )
rG   r   r   r   r   rx   r5   r   r   r   )rF   rW   r   rX   r'   r   rY   r   r   r   r   r   r   Zprediction_scoresZmasked_lm_lossloss_fctr   rJ   rJ   rK   r]     s6   
zIBertForMaskedLM.forward
NNNNNNNNNN)r^   r_   r`   Z_tied_weights_keysr1   r   r   r   r   r9   r   r   r   r   r   r   r]   rb   rJ   rJ   rH   rK   r     sN    		
r   c                       s2   e Zd ZdZ fddZdd Zd
dd	Z  ZS )r   z)I-BERT Head for masked language modeling.c                    sd   t    t|j|j| _tj|j|jd| _t|j|j	| _
tt|j	| _| j| j
_d S )N)r.   )r0   r1   r   r   r6   r   r@   r?   
layer_normr5   r   	Parameterr9   rU   rg   rE   rH   rJ   rK   r1   S  s   
zIBertLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r   )r   r
   r   r   )rF   featureskwargsrz   rJ   rJ   rK   r]   \  s
   


zIBertLMHead.forwardr   Nc                 C   s,   | j jjjdkr| j| j _d S | j j| _d S )Nmeta)r   rg   rN   typer   rJ   rJ   rK   _tie_weightsf  s   zIBertLMHead._tie_weights)r   N)r^   r_   r`   ra   r1   r]   r   rb   rJ   rJ   rH   rK   r   P  s
    	
r   z
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
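
# Usage sketch (illustrative, not executed at import time): the checkpoint name below is an
# assumption; any I-BERT checkpoint with a matching tokenizer works the same way.
#
#     from transformers import AutoTokenizer, IBertForMaskedLM
#
#     tokenizer = AutoTokenizer.from_pretrained("kssteven/ibert-roberta-base")
#     model = IBertForMaskedLM.from_pretrained("kssteven/ibert-roberta-base")
#     inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
#     logits = model(**inputs).logits
#     mask_positions = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
#     print(tokenizer.decode(logits[0, mask_positions].argmax(-1)))
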
    )Zcustom_introc                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eeej f fddZ  ZS )IBertForSequenceClassificationc                    s8   t  | |j| _t|dd| _t|| _|   d S r   )r0   r1   
num_labelsr   r   IBertClassificationHead
classifierr   rE   rH   rJ   rK   r1   v  s
   
z'IBertForSequenceClassification.__init__NrW   r   rX   r'   r   rY   r   r   r   r   r   c                 C   sh  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur| j jdu rQ| jdkr7d| j _n| jdkrM|jtjksH|jtj	krMd| j _nd| j _| j jdkrot
 }| jdkri|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|d	d  }|dur|f| S |S t|||j|jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationr(   rw   r   )rG   r   r   r   Zproblem_typer   rM   r9   rV   rm   r   squeezer   rx   r   r   r   r   rF   rW   r   rX   r'   r   rY   r   r   r   r   r   r   r   r   r   r   rJ   rJ   rK   r]     sT   


"


z&IBertForSequenceClassification.forwardr   )r^   r_   r`   r1   r   r   r9   r   r   r   r   r   r   r]   rb   rJ   rJ   rH   rK   r   o  sH    
	
r   c                       s   e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eeej f fddZ  ZS )IBertForMultipleChoicec                    s@   t  | t|| _t|j| _t|j	d| _
|   d S )Nr   )r0   r1   r   r   r   rB   rC   rD   r   r6   r   r   rE   rH   rJ   rK   r1     s
   
zIBertForMultipleChoice.__init__NrW   rX   r   r   r'   r   rY   r   r   r   r   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
@auto_docstring
class IBertForMultipleChoice(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.ibert = IBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MultipleChoiceModelOutput, Tuple[torch.FloatTensor]]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Flatten the choice dimension into the batch dimension
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.ibert(
            flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask, head_mask=head_mask, inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss, logits=reshaped_logits,
            hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


@auto_docstring
class IBertForTokenClassification(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids,
            head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss, logits=logits,
            hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )


class IBertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        hidden_states = features[:, 0, :]  # take <s> token (equivalent to [CLS])
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


@auto_docstring
class IBertForQuestionAnswering(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, Tuple[torch.FloatTensor]]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids,
            head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss, start_logits=start_logits, end_logits=end_logits,
            hidden_states=outputs.hidden_states, attentions=outputs.attentions,
        )
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's *utils.make_positions*.

    Args:
    input_ids (`torch.LongTensor`):
           Indices of input sequence tokens in the vocabulary.

    Returns: torch.Tensor
    """
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = [
    "IBertForMaskedLM",
    "IBertForMultipleChoice",
    "IBertForQuestionAnswering",
    "IBertForSequenceClassification",
    "IBertForTokenClassification",
    "IBertModel",
    "IBertPreTrainedModel",
]
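
# Worked example (illustration only) for `create_position_ids_from_input_ids`, using the
# RoBERTa-style convention `padding_idx = 1` that I-BERT inherits: padded slots keep
# `padding_idx`, and real tokens count up from `padding_idx + 1`.
#
#     >>> ids = torch.tensor([[0, 42, 37, 2, 1, 1]])  # 1 is the pad token id
#     >>> create_position_ids_from_input_ids(ids, padding_idx=1)
#     tensor([[2, 3, 4, 5, 1, 1]])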