o
    Zh                     @   s<  d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZ ddlmZ ddlmZmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$ e"%e&Z'dd Z(G dd de
j)Z*e
j+e*dZ,G dd de
j)Z-G dd de
j)Z.G dd de
j)Z/G dd de
j)Z0G dd de
j)Z1G dd de
j)Z2G dd  d e
j)Z3G d!d" d"e
j)Z4G d#d$ d$e
j)Z5G d%d& d&e
j)Z6G d'd( d(e
j)Z7G d)d* d*e
j)Z8G d+d, d,e
j)Z9G d-d. d.e
j)Z:G d/d0 d0e
j)Z;G d1d2 d2e
j)Z<G d3d4 d4e
j)Z=G d5d6 d6e
j)Z>e!G d7d8 d8eZ?eG d9d: d:e Z@e!G d;d< d<e?ZAe!d=d>G d?d@ d@e?ZBe!G dAdB dBe?ZCG dCdD dDe
j)ZDe!dEd>G dFdG dGe?ZEe!dHd>G dIdJ dJe?ZFe!G dKdL dLe?ZGe!G dMdN dNe?ZHe!G dOdP dPe?ZIg dQZJdS )R    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )MobileBertConfigc                 C   s  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6t||	D ]\}
}|
dd}
|
d	d
}
|
dd}
|
dd}
|
d}
tdd |
D rt	dd|
  q\| }|
D ]~}|d|r|d|}n|g}|d dks|d dkrt|d}nI|d dks|d dkrt|d}n7|d dkrt|d}n+|d dkrt|d}nz	t||d }W n ty   t	dd|
  Y qw t|dkrt|d }|| }q|d d d!kr%t|d}n
|dkr/||}z|j|jksDJ d"|j d#|j d$W n ty^ } z| j|j|jf7  _ d}~ww t	d%|
  t||_q\| S )&z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape Z	ffn_layerffnZFakeLayerNorm	LayerNormZextra_output_weightszdense/kernelZbert
mobilebert/c                 s   s    | ]}|d v V  qdS ))Zadam_vZadam_mZAdamWeightDecayOptimizerZAdamWeightDecayOptimizer_1Zglobal_stepN ).0nr    r    a/var/www/auris/lib/python3.10/site-packages/transformers/models/mobilebert/modeling_mobilebert.py	<genexpr>V   s
    
z0load_tf_weights_in_mobilebert.<locals>.<genexpr>z	Skipping z[A-Za-z]+_\d+z_(\d+)ZkernelgammaweightZoutput_biasbetabiasZoutput_weightsZsquad
classifier   r   iZ_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpyZ
tensorflowImportErrorloggererrorospathabspathinfotrainZlist_variablesZload_variableappendzipreplacesplitanyjoin	fullmatchgetattrAttributeErrorlenint	transposeshapeAssertionErrorargstorchZ
from_numpydata)modelconfigZtf_checkpoint_pathr+   nptfZtf_pathZ	init_varsnamesZarraysnamerA   arrayZpointerZm_nameZscope_namesnumer    r    r#   load_tf_weights_in_mobilebert5   s   



rO   c                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )	NoNormNc                    s2   t    tt|| _tt|| _d S N)	super__init__r   	ParameterrD   zerosr(   onesr&   )selfZ	feat_sizeeps	__class__r    r#   rS      s   
zNoNorm.__init__input_tensorreturnc                 C   s   || j  | j S rQ   )r&   r(   )rW   r[   r    r    r#   forward   s   zNoNorm.forwardrQ   __name__
__module____qualname__rS   rD   Tensorr]   __classcell__r    r    rY   r#   rP      s    rP   )
layer_normZno_normc                       sb   e Zd ZdZ fddZ				ddeej deej deej deej d	ej	f
d
dZ
  ZS )MobileBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    |j| _|j| _|j| _tj|j|j|jd| _	t|j
|j| _t|j|j| _| jr4dnd}| j| }t||j| _t|j |j| _t|j| _| jdt|j
ddd d S )N)padding_idxr
   r   position_ids)r   F)
persistent)rR   rS   trigram_inputembedding_sizehidden_sizer   	Embedding
vocab_sizeZpad_token_idword_embeddingsZmax_position_embeddingsposition_embeddingsZtype_vocab_sizetoken_type_embeddingsLinearembedding_transformationNORM2FNnormalization_typer   Dropouthidden_dropout_probdropoutZregister_bufferrD   Zarangeexpand)rW   rG   Zembed_dim_multiplierZembedded_input_sizerY   r    r#   rS      s   


zMobileBertEmbeddings.__init__N	input_idstoken_type_idsrg   inputs_embedsr\   c           
      C   s*  |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| jrktjt	j
j|d d dd f g ddd|t	j
j|d d d df g dddgdd	}| jst| j| jkry| |}| |}| |}|| | }	| |	}	| |	}	|	S )
Nrh   r   dtypedevice)r   r   r   r   r   r           )value)r   r   r   r   r   r   r*   dim)sizerg   rD   rU   longr   ro   rj   catr   
functionalpadrk   rl   rs   rp   rq   r   rx   )
rW   rz   r{   rg   r|   input_shapeZ
seq_lengthrp   rq   
embeddingsr    r    r#   r]      s4   

$$




zMobileBertEmbeddings.forward)NNNN)r_   r`   ra   __doc__rS   r   rD   
LongTensorFloatTensorrb   r]   rc   r    r    rY   r#   re      s$    re   c                       sn   e Zd Z fddZdd Z			ddejdejdejd	eej d
eej dee	 de
ej fddZ  ZS )MobileBertSelfAttentionc                    s   t    |j| _t|j|j | _| j| j | _t|j| j| _	t|j| j| _
t|jr3|jn|j| j| _t|j| _d S rQ   )rR   rS   num_attention_headsr?   true_hidden_sizeattention_head_sizeall_head_sizer   rr   querykeyuse_bottleneck_attentionrl   r   rv   Zattention_probs_dropout_probrx   rW   rG   rY   r    r#   rS      s   
z MobileBertSelfAttention.__init__c                 C   s6   |  d d | j| jf }||}|ddddS )Nrh   r   r*   r   r
   )r   r   r   viewpermute)rW   xZnew_x_shaper    r    r#   transpose_for_scores   s   
z,MobileBertSelfAttention.transpose_for_scoresNquery_tensor
key_tensorvalue_tensorattention_mask	head_maskoutput_attentionsr\   c                 C   s   |  |}| |}| |}	| |}
| |}| |	}t|
|dd}|t| j	 }|d ur8|| }t
jj|dd}| |}|d urM|| }t||}|dddd }| d d | jf }||}|rv||f}|S |f}|S )Nrh   r   r   r*   r   r
   )r   r   r   r   rD   matmulr@   mathsqrtr   r   r   Zsoftmaxrx   r   
contiguousr   r   r   )rW   r   r   r   r   r   r   Zmixed_query_layerZmixed_key_layerZmixed_value_layerZquery_layerZ	key_layerZvalue_layerZattention_scoresZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr    r    r#   r]      s,   
	






zMobileBertSelfAttention.forwardNNN)r_   r`   ra   rS   r   rD   rb   r   r   boolr   r]   rc   r    r    rY   r#   r      s*    
r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )MobileBertSelfOutputc                    sX   t    |j| _t|j|j| _t|j |j|j	d| _
| js*t|j| _d S d S NrX   )rR   rS   use_bottleneckr   rr   r   densert   ru   layer_norm_epsr   rv   rw   rx   r   rY   r    r#   rS     s   
zMobileBertSelfOutput.__init__hidden_statesresidual_tensorr\   c                 C   s,   |  |}| js| |}| || }|S rQ   )r   r   rx   r   rW   r   r   layer_outputsr    r    r#   r]     s
   

zMobileBertSelfOutput.forwardr^   r    r    rY   r#   r     s    $r   c                       st   e Zd Z fddZdd Z			ddejdejdejd	ejd
eej deej dee	 de
ej fddZ  ZS )MobileBertAttentionc                    s*   t    t|| _t|| _t | _d S rQ   )rR   rS   r   rW   r   outputsetpruned_headsr   rY   r    r#   rS   (  s   


zMobileBertAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )r>   r   rW   r   r   r   r   r   r   r   r   r   r   union)rW   headsindexr    r    r#   prune_heads.  s   zMobileBertAttention.prune_headsNr   r   r   layer_inputr   r   r   r\   c                 C   s:   |  ||||||}| |d |}	|	f|dd   }
|
S )Nr   r   )rW   r   )rW   r   r   r   r   r   r   r   Zself_outputsattention_outputr   r    r    r#   r]   @  s   

zMobileBertAttention.forwardr   )r_   r`   ra   rS   r   rD   rb   r   r   r   r   r]   rc   r    r    rY   r#   r   '  s.    	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )MobileBertIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rQ   )rR   rS   r   rr   r   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   rY   r    r#   rS   Z  s
   
zMobileBertIntermediate.__init__r   r\   c                 C   s   |  |}| |}|S rQ   )r   r   rW   r   r    r    r#   r]   b     

zMobileBertIntermediate.forwardr^   r    r    rY   r#   r   Y  s    r   c                       r   )OutputBottleneckc                    sF   t    t|j|j| _t|j |j|j	d| _
t|j| _d S r   )rR   rS   r   rr   r   rl   r   rt   ru   r   r   rv   rw   rx   r   rY   r    r#   rS   i  s   
zOutputBottleneck.__init__r   r   r\   c                 C   s&   |  |}| |}| || }|S rQ   )r   rx   r   r   r    r    r#   r]   o  s   

zOutputBottleneck.forwardr^   r    r    rY   r#   r   h  s    $r   c                       s>   e Zd Z fddZdejdejdejdejfddZ  ZS )	MobileBertOutputc                    s\   t    |j| _t|j|j| _t|j	 |j| _
| js't|j| _d S t|| _d S rQ   )rR   rS   r   r   rr   r   r   r   rt   ru   r   rv   rw   rx   r   
bottleneckr   rY   r    r#   rS   w  s   
zMobileBertOutput.__init__intermediate_statesresidual_tensor_1residual_tensor_2r\   c                 C   sJ   |  |}| js| |}| || }|S | || }| ||}|S rQ   )r   r   rx   r   r   )rW   r   r   r   layer_outputr    r    r#   r]     s   

zMobileBertOutput.forwardr^   r    r    rY   r#   r   v  s    
r   c                       r   )BottleneckLayerc                    8   t    t|j|j| _t|j |j|j	d| _
d S r   )rR   rS   r   rr   rl   Zintra_bottleneck_sizer   rt   ru   r   r   r   rY   r    r#   rS        
zBottleneckLayer.__init__r   r\   c                 C   s   |  |}| |}|S rQ   r   r   )rW   r   r   r    r    r#   r]     r   zBottleneckLayer.forwardr^   r    r    rY   r#   r         r   c                       s6   e Zd Z fddZdejdeej fddZ  ZS )
Bottleneckc                    s<   t    |j| _|j| _t|| _| jrt|| _d S d S rQ   )rR   rS   key_query_shared_bottleneckr   r   input	attentionr   rY   r    r#   rS     s   

zBottleneck.__init__r   r\   c                 C   sB   |  |}| jr|fd S | jr| |}||||fS ||||fS )N   )r   r   r   r   )rW   r   Zbottlenecked_hidden_statesZshared_attention_inputr    r    r#   r]     s   


zBottleneck.forward	r_   r`   ra   rS   rD   rb   r   r]   rc   r    r    rY   r#   r     s    "r   c                       r   )	FFNOutputc                    r   r   )rR   rS   r   rr   r   r   r   rt   ru   r   r   r   rY   r    r#   rS     r   zFFNOutput.__init__r   r   r\   c                 C   s   |  |}| || }|S rQ   r   r   r    r    r#   r]     s   
zFFNOutput.forwardr^   r    r    rY   r#   r     s    $r   c                       r   )FFNLayerc                    s"   t    t|| _t|| _d S rQ   )rR   rS   r   intermediater   r   r   rY   r    r#   rS     s   

zFFNLayer.__init__r   r\   c                 C   s   |  |}| ||}|S rQ   )r   r   )rW   r   intermediate_outputr   r    r    r#   r]     s   
zFFNLayer.forwardr^   r    r    rY   r#   r     r   r   c                       sZ   e Zd Z fddZ			ddejdeej deej dee de	ej f
d	d
Z
  ZS )MobileBertLayerc                    s~   t     j| _ j| _t | _t | _t | _	| jr$t
 | _ jdkr=t fddt jd D | _d S d S )Nr   c                       g | ]}t  qS r    )r   r!   _rG   r    r#   
<listcomp>      z,MobileBertLayer.__init__.<locals>.<listcomp>)rR   rS   r   num_feedforward_networksr   r   r   r   r   r   r   r   r   
ModuleListranger   r   rY   r   r#   rS     s   





(zMobileBertLayer.__init__Nr   r   r   r   r\   c              	   C   s   | j r| |\}}}}n	|gd \}}}}| j|||||||d}	|	d }
|
f}|	dd  }| jdkrGt| jD ]\}}||
}
||
f7 }q9| |
}| ||
|}|f| t	d|||||
|f | }|S )Nr   )r   r   r   i  )
r   r   r   r   	enumerater   r   r   rD   Ztensor)rW   r   r   r   r   r   r   r   r   Zself_attention_outputsr   sr   iZ
ffn_moduler   r   r    r    r#   r]     sJ   	

zMobileBertLayer.forwardr   )r_   r`   ra   rS   rD   rb   r   r   r   r   r]   rc   r    r    rY   r#   r     s     r   c                       sp   e Zd Z fddZ					ddejdeej deej d	ee d
ee dee de	e
ef fddZ  ZS )MobileBertEncoderc                    s.   t    t fddt jD | _d S )Nc                    r   r    )r   r   r   r    r#   r     r   z.MobileBertEncoder.__init__.<locals>.<listcomp>)rR   rS   r   r   r   num_hidden_layerslayerr   rY   r   r#   rS     s   
$zMobileBertEncoder.__init__NFTr   r   r   r   output_hidden_statesreturn_dictr\   c                 C   s   |rdnd }|r
dnd }t | jD ]!\}	}
|r||f }|
||||	 |}|d }|r2||d f }q|r:||f }|sHtdd |||fD S t|||dS )Nr    r   r   c                 s   s    | ]	}|d ur|V  qd S rQ   r    )r!   vr    r    r#   r$   9  s    z,MobileBertEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)r   r   tupler   )rW   r   r   r   r   r   r   Zall_hidden_statesZall_attentionsr   Zlayer_moduler   r    r    r#   r]     s,   	

zMobileBertEncoder.forward)NNFFT)r_   r`   ra   rS   rD   rb   r   r   r   r   r   r   r]   rc   r    r    rY   r#   r     s,    
r   c                       r   )MobileBertPoolerc                    s2   t    |j| _| jrt|j|j| _d S d S rQ   )rR   rS   Zclassifier_activationdo_activater   rr   rl   r   r   rY   r    r#   rS   @  s
   
zMobileBertPooler.__init__r   r\   c                 C   s2   |d d df }| j s|S | |}t|}|S )Nr   )r   r   rD   tanh)rW   r   Zfirst_token_tensorpooled_outputr    r    r#   r]   F  s   

zMobileBertPooler.forwardr^   r    r    rY   r#   r   ?  s    r   c                       r   )!MobileBertPredictionHeadTransformc                    sX   t    t|j|j| _t|jtrt	|j | _
n|j| _
td |j|jd| _d S )Nrd   r   )rR   rS   r   rr   rl   r   r   r   r   r   transform_act_fnrt   r   r   r   rY   r    r#   rS   S  s   
z*MobileBertPredictionHeadTransform.__init__r   r\   c                 C   s"   |  |}| |}| |}|S rQ   )r   r   r   r   r    r    r#   r]   \  s   


z)MobileBertPredictionHeadTransform.forwardr^   r    r    rY   r#   r   R  s    	r   c                       s<   e Zd Z fddZd
ddZdejdejfdd	Z  ZS )MobileBertLMPredictionHeadc                    sh   t    t|| _tj|j|j|j dd| _	tj|j|jdd| _
tt|j| _| j| j
_d S )NF)r(   )rR   rS   r   	transformr   rr   rn   rl   rk   r   decoderrT   rD   rU   r(   r   rY   r    r#   rS   d  s   

z#MobileBertLMPredictionHead.__init__r\   Nc                 C   s   | j | j_ d S rQ   )r(   r   rW   r    r    r#   _tie_weightso  s   z'MobileBertLMPredictionHead._tie_weightsr   c                 C   s>   |  |}|tj| jj | jjgdd}|| jj7 }|S )Nr   r   )	r   r   rD   r   r   r&   tr   r(   r   r    r    r#   r]   r  s   
$z"MobileBertLMPredictionHead.forward)r\   N)	r_   r`   ra   rS   r   rD   rb   r]   rc   r    r    rY   r#   r   c  s    
r   c                       r   )MobileBertOnlyMLMHeadc                    s   t    t|| _d S rQ   )rR   rS   r   predictionsr   rY   r    r#   rS   z  s   
zMobileBertOnlyMLMHead.__init__sequence_outputr\   c                 C      |  |}|S rQ   )r   )rW   r  prediction_scoresr    r    r#   r]   ~     
zMobileBertOnlyMLMHead.forwardr^   r    r    rY   r#   r   y      r   c                       s<   e Zd Z fddZdejdejdeej fddZ  ZS )MobileBertPreTrainingHeadsc                    s(   t    t|| _t|jd| _d S Nr*   )rR   rS   r   r   r   rr   rl   seq_relationshipr   rY   r    r#   rS     s   

z#MobileBertPreTrainingHeads.__init__r  r   r\   c                 C   s   |  |}| |}||fS rQ   )r   r  )rW   r  r   r  seq_relationship_scorer    r    r#   r]     s   

z"MobileBertPreTrainingHeads.forwardr   r    r    rY   r#   r    s    (r  c                   @   s    e Zd ZeZeZdZdd ZdS )MobileBertPreTrainedModelr   c                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjtfrZ|jj	  |jjd dS t |trg|jj	  dS dS )zInitialize the weightsr   )meanZstdNg      ?)r   r   rr   r&   rE   Znormal_rG   Zinitializer_ranger(   Zzero_rm   rf   r   rP   Zfill_r   )rW   moduler    r    r#   _init_weights  s    


z'MobileBertPreTrainedModel._init_weightsN)	r_   r`   ra   r   Zconfig_classrO   Zload_tf_weightsZbase_model_prefixr  r    r    r    r#   r
    s
    r
  c                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )MobileBertForPreTrainingOutputab  
    Output type of [`MobileBertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprediction_logitsseq_relationship_logitsr   r   )r_   r`   ra   r   r  r   rD   r   __annotations__r  r  r   r   r   r    r    r    r#   r    s   
 r  c                       s   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Ze									dde	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e de	e de	e deeef fddZ  ZS )MobileBertModelz.
    https://arxiv.org/pdf/2004.02984.pdf
    Tc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rR   rS   rG   re   r   r   encoderr   pooler	post_init)rW   rG   add_pooling_layerrY   r    r#   rS     s   

zMobileBertModel.__init__c                 C   s   | j jS rQ   r   ro   r   r    r    r#   get_input_embeddings  s   z$MobileBertModel.get_input_embeddingsc                 C   s   || j _d S rQ   r  )rW   r   r    r    r#   set_input_embeddings  s   z$MobileBertModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )rW   Zheads_to_pruner   r   r    r    r#   _prune_heads  s   zMobileBertModel._prune_headsNrz   r   r{   rg   r   r|   r   r   r   r\   c
                 C   sh  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|d urQ|jn|j}|d u r_tj	|
|d}|d u rltj
|
tj|d}| ||
}| || j j}| j||||d}| j||||||	d}|d }| jd ur| |nd }|	s||f|d	d   S t|||j|jd
S )NzDYou cannot specify both input_ids and inputs_embeds at the same timerh   z5You have to specify either input_ids or inputs_embeds)r   r}   )rz   rg   r{   r|   )r   r   r   r   r   r   r   )r   Zpooler_outputr   r   )rG   r   r   use_return_dict
ValueErrorZ%warn_if_padding_and_no_attention_maskr   r   rD   rV   rU   r   Zget_extended_attention_maskZget_head_maskr   r   r  r  r   r   r   )rW   rz   r   r{   rg   r   r|   r   r   r   r   r   Zextended_attention_maskZembedding_outputZencoder_outputsr  r   r    r    r#   r]     sP   
zMobileBertModel.forward)T)	NNNNNNNNN)r_   r`   ra   r   rS   r  r  r  r   r   rD   r   r   r   r   r   r   r]   rc   r    r    rY   r#   r    sJ    	

r  z
    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `next sentence prediction (classification)` head.
    )Zcustom_introc                       s   e Zd ZddgZ fddZdd Zdd Zdd
ee de	j
f fddZe																						ddeej deej deej deej deej deej deej deej deej deej deej deeef fddZ  ZS )MobileBertForPreTrainingcls.predictions.decoder.weightcls.predictions.decoder.biasc                    ,   t  | t|| _t|| _|   d S rQ   )rR   rS   r  r   r  clsr  r   rY   r    r#   rS   >  s   

z!MobileBertForPreTraining.__init__c                 C   
   | j jjS rQ   r#  r   r   r   r    r    r#   get_output_embeddingsF     
z.MobileBertForPreTraining.get_output_embeddingsc                 C      || j j_|j| j j_d S rQ   r#  r   r   r(   rW   Znew_embeddingsr    r    r#   set_output_embeddingsI     
z.MobileBertForPreTraining.set_output_embeddingsNnew_num_tokensr\   c                    *   | j | jjj|dd| jj_t j|dS NT)r-  Z
transposed)r-  Z_get_resized_lm_headr#  r   r   rR   resize_token_embeddingsrW   r-  rY   r    r#   r1  M  s   z0MobileBertForPreTraining.resize_token_embeddingsrz   r   r{   rg   r   r|   labelsnext_sentence_labelr   r   r   c                 C   s   |dur|n| j j}| j|||||||	|
|d	}|dd \}}| ||\}}d}|durS|durSt }||d| j j|d}||dd|d}|| }|sj||f|dd  }|durh|f| S |S t||||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```Nr   r{   rg   r   r|   r   r   r   r*   rh   )r  r  r  r   r   )
rG   r  r   r#  r   r   rn   r  r   r   )rW   rz   r   r{   rg   r   r|   r3  r4  r   r   r   r   r  r   r  r	  
total_lossloss_fctmasked_lm_lossnext_sentence_lossr   r    r    r#   r]   U  s<   +z MobileBertForPreTraining.forwardrQ   NNNNNNNNNNN)r_   r`   ra   _tied_weights_keysrS   r&  r+  r   r?   r   rm   r1  r   rD   r   r   r   r   r  r]   rc   r    r    rY   r#   r  5  sV    	

r  c                       s   e Zd ZddgZ fddZdd Zdd Zdd
ee de	j
f fddZe																				ddeej deej deej deej deej deej deej dee dee dee deeef fddZ  ZS )MobileBertForMaskedLMr   r!  c                    s6   t  | t|dd| _t|| _|| _|   d S NF)r  )rR   rS   r  r   r   r#  rG   r  r   rY   r    r#   rS     s
   
zMobileBertForMaskedLM.__init__c                 C   r$  rQ   r%  r   r    r    r#   r&    r'  z+MobileBertForMaskedLM.get_output_embeddingsc                 C   r(  rQ   r)  r*  r    r    r#   r+    r,  z+MobileBertForMaskedLM.set_output_embeddingsNr-  r\   c                    r.  r/  r0  r2  rY   r    r#   r1    s   z-MobileBertForMaskedLM.resize_token_embeddingsrz   r   r{   rg   r   r|   r3  r   r   r   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur8t }||d| j j|d}|
sN|f|dd  }|durL|f| S |S t|||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr5  r   rh   r*   r  logitsr   r   )
rG   r  r   r#  r   r   rn   r   r   r   )rW   rz   r   r{   rg   r   r|   r3  r   r   r   r   r  r  r8  r7  r   r    r    r#   r]     s6   
zMobileBertForMaskedLM.forwardrQ   
NNNNNNNNNN)r_   r`   ra   r;  rS   r&  r+  r   r?   r   rm   r1  r   rD   r   r   r   r   r   r   r]   rc   r    r    rY   r#   r<    sP    		

r<  c                       r   )MobileBertOnlyNSPHeadc                    s   t    t|jd| _d S r  )rR   rS   r   rr   rl   r  r   rY   r    r#   rS     s   
zMobileBertOnlyNSPHead.__init__r   r\   c                 C   r  rQ   )r  )rW   r   r	  r    r    r#   r]     r  zMobileBertOnlyNSPHead.forwardr^   r    r    rY   r#   rA    r  rA  zZ
    MobileBert Model with a `next sentence prediction (classification)` head on top.
    c                       s   e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eef fddZ  ZS )#MobileBertForNextSentencePredictionc                    r"  rQ   )rR   rS   r  r   rA  r#  r  r   rY   r    r#   rS     s   

z,MobileBertForNextSentencePrediction.__init__Nrz   r   r{   rg   r   r|   r3  r   r   r   r\   c                 K   s   d|v rt dt |d}|
dur|
n| jj}
| j||||||||	|
d	}|d }| |}d}|durEt }||	dd|	d}|
s[|f|dd  }|durY|f| S |S t
|||j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`.

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```r4  zoThe `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.Nr5  r   rh   r*   r>  )warningswarnFutureWarningpoprG   r  r   r#  r   r   r   r   r   )rW   rz   r   r{   rg   r   r|   r3  r   r   r   kwargsr   r   r	  r9  r7  r   r    r    r#   r]     sB   )

z+MobileBertForNextSentencePrediction.forwardr@  )r_   r`   ra   rS   r   r   rD   r   r   r   r   r   r   r]   rc   r    r    rY   r#   rB    sH    		

rB  z
    MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee dee dee de	e
ej ef fddZ  ZS )#MobileBertForSequenceClassificationc                    sd   t  | |j| _|| _t|| _|jd ur|jn|j}t	|| _
t|j|j| _|   d S rQ   )rR   rS   
num_labelsrG   r  r   classifier_dropoutrw   r   rv   rx   rr   rl   r)   r  rW   rG   rK  rY   r    r#   rS   i  s   
z,MobileBertForSequenceClassification.__init__Nrz   r   r{   rg   r   r|   r3  r   r   r   r\   c                 C   sr  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur| j jdu rV| jdkr<d| j _n| jdkrR|jtj	ksM|jtj
krRd| j _nd| j _| j jdkrtt }| jdkrn|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr5  r   Z
regressionZsingle_label_classificationZmulti_label_classificationrh   r*   r>  )rG   r  r   rx   r)   Zproblem_typerJ  r~   rD   r   r?   r	   squeezer   r   r   r   r   r   )rW   rz   r   r{   rg   r   r|   r3  r   r   r   r   r   r?  r  r7  r   r    r    r#   r]   x  sV   



"


z+MobileBertForSequenceClassification.forwardr@  )r_   r`   ra   rS   r   r   rD   rb   r   r   r   r   r]   rc   r    r    rY   r#   rI  a  sH    	
rI  c                       s   e Zd Z fddZe											ddeej deej deej deej deej d	eej d
eej deej dee dee dee de	e
ej ef fddZ  ZS )MobileBertForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r=  )
rR   rS   rJ  r  r   r   rr   rl   
qa_outputsr  r   rY   r    r#   rS     s
   z'MobileBertForQuestionAnswering.__init__Nrz   r   r{   rg   r   r|   start_positionsend_positionsr   r   r   r\   c                 C   sH  |d ur|n| j j}| j|||||||	|
|d	}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrO|d}t| dkr\|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr5  r   r   rh   r   )Zignore_indexr*   )r  start_logits
end_logitsr   r   )rG   r  r   rO  r8   rM  r   r>   r   clampr   r   r   r   )rW   rz   r   r{   rg   r   r|   rP  rQ  r   r   r   r   r  r?  rR  rS  r6  Zignored_indexr7  Z
start_lossZend_lossr   r    r    r#   r]     sP   






z&MobileBertForQuestionAnswering.forwardr:  )r_   r`   ra   rS   r   r   rD   rb   r   r   r   r   r]   rc   r    r    rY   r#   rN    sN    
	
rN  c                       rH  )MobileBertForMultipleChoicec                    sT   t  | t|| _|jd ur|jn|j}t|| _t	|j
d| _|   d S )Nr   )rR   rS   r  r   rK  rw   r   rv   rx   rr   rl   r)   r  rL  rY   r    r#   rS     s   
z$MobileBertForMultipleChoice.__init__Nrz   r   r{   rg   r   r|   r3  r   r   r   r\   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rh   r   r5  r*   r>  )rG   r  rA   r   r   r   rx   r)   r   r   r   r   )rW   rz   r   r{   rg   r   r|   r3  r   r   r   Znum_choicesr   r   r?  Zreshaped_logitsr  r7  r   r    r    r#   r]      sL   ,


z#MobileBertForMultipleChoice.forwardr@  )r_   r`   ra   rS   r   r   rD   rb   r   r   r   r   r]   rc   r    r    rY   r#   rU    sH    	
rU  c                       rH  ) MobileBertForTokenClassificationc                    sb   t  | |j| _t|dd| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S r=  )rR   rS   rJ  r  r   rK  rw   r   rv   rx   rr   rl   r)   r  rL  rY   r    r#   rS     s   z)MobileBertForTokenClassification.__init__Nrz   r   r{   rg   r   r|   r3  r   r   r   r\   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur<t }||d| j|d}|
sR|f|dd  }|durP|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr5  r   rh   r*   r>  )rG   r  r   rx   r)   r   r   rJ  r   r   r   )rW   rz   r   r{   rg   r   r|   r3  r   r   r   r   r  r?  r  r7  r   r    r    r#   r]     s8   

z(MobileBertForTokenClassification.forwardr@  )r_   r`   ra   rS   r   r   rD   rb   r   r   r   r   r]   rc   r    r    rY   r#   rV  |  sH    	
rV  )r<  rU  rB  r  rN  rI  rV  r   r  r
  rO   )Kr   r0   rC  dataclassesr   typingr   r   r   rD   r   Ztorch.nnr   r   r	   Zactivationsr   Zmodeling_outputsr   r   r   r   r   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   utilsr   r   r   Zconfiguration_mobilebertr   Z
get_loggerr_   r.   rO   ModulerP   r   rt   re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r
  r  r  r  r<  rA  rB  rI  rN  rU  rV  __all__r    r    r    r#   <module>   s   (

N
L:2$?*
!jiP
]YMjE