"""PyTorch ALBERT model."""

import math
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    is_torch_greater_or_equal_than_2_2,
    prune_linear_layer,
)
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_albert import AlbertConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        print(name)

    for name, array in zip(names, arrays):
        original_name = name

        # If saved from the TF HUB module
        name = name.replace("module/", "")

        # Renaming and simplifying
        name = name.replace("ffn_1", "ffn")
        name = name.replace("bert/", "albert/")
        name = name.replace("attention_1", "attention")
        name = name.replace("transform/", "")
        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
        name = name.replace("LayerNorm", "attention/LayerNorm")
        name = name.replace("transformer/", "")

        # The feed forward layer had an 'intermediate' step which has been abstracted away
        name = name.replace("intermediate/dense/", "")
        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")

        # ALBERT attention was split between self and output, which have been abstracted away
        name = name.replace("/output/", "/")
        name = name.replace("/self/", "/")

        # The pooler is a linear layer
        name = name.replace("pooler/dense", "pooler")

        # The classifier was simplified to predictions from cls/predictions
        name = name.replace("cls/predictions", "predictions")
        name = name.replace("predictions/attention", "predictions")

        # Naming was changed to be more explicit
        name = name.replace("embeddings/attention", "embeddings")
        name = name.replace("inner_group_", "albert_layers/")
        name = name.replace("group_", "albert_layer_groups/")

        # Classifier
        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
            name = "classifier/" + name

        # No ALBERT model currently handles the next sentence prediction task
        if "seq_relationship" in name:
            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
            name = name.replace("weights", "weight")

        name = name.split("/")

        # Ignore the gradients applied by the LAMB/ADAM optimizers
        if (
            "adam_m" in name
            or "adam_v" in name
            or "AdamWeightDecayOptimizer" in name
            or "AdamWeightDecayOptimizer_1" in name
            or "global_step" in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue

        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if m_name.endswith("_embeddings"):
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)

        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise

        print(f"Initialize PyTorch weight {name} from {original_name}")
        pointer.data = torch.from_numpy(array)

    return model


class AlbertEmbeddings(nn.Module):
    """
    Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        # self.LayerNorm is not snake-cased so that TensorFlow checkpoint variables can still be loaded
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # Use the registered token_type_ids buffer (expanded to the batch) when the caller does not pass one
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

class AlbertAttention(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pruned_heads = set()

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def prune_heads(self, heads: List[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # The attention mask is precomputed for all layers in the AlbertModel forward() function
            attention_scores = attention_scores + attention_mask

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        # Normalize the attention scores to probabilities
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This drops out entire tokens to attend to, as in the original Transformer paper
        attention_probs = self.attention_dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.transpose(2, 1).flatten(2)

        projected_context_layer = self.dense(context_layer)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,)


class AlbertSdpaAttention(AlbertAttention):
    def __init__(self, config):
        super().__init__(config)
        self.dropout_prob = config.attention_probs_dropout_prob
        self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        if self.position_embedding_type != "absolute" or output_attentions:
            logger.warning(
                "AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
                "non-absolute `position_embedding_type` or `output_attentions=True`. Falling back to the eager "
                "attention implementation, but specifying the eager implementation will be required from Transformers "
                "version v5.0.0 onwards. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
            return super().forward(hidden_states, attention_mask, head_mask, output_attentions)

        batch_size, seq_len, _ = hidden_states.size()
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # SDPA with the memory-efficient backend needs contiguous inputs with a custom attn_mask on
        # torch < 2.2, so call `.contiguous()` in that case.
        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
            query_layer = query_layer.contiguous()
            key_layer = key_layer.contiguous()
            value_layer = value_layer.contiguous()

        attention_output = torch.nn.functional.scaled_dot_product_attention(
            query=query_layer,
            key=key_layer,
            value=value_layer,
            attn_mask=attention_mask,
            dropout_p=self.dropout_prob if self.training else 0.0,
            is_causal=False,
        )

        attention_output = attention_output.transpose(1, 2)
        attention_output = attention_output.reshape(batch_size, seq_len, self.all_head_size)

        projected_context_layer = self.dense(attention_output)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer,)


ALBERT_ATTENTION_CLASSES = {
    "eager": AlbertAttention,
    "sdpa": AlbertSdpaAttention,
}


class AlbertLayer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = ALBERT_ATTENTION_CLASSES[config._attn_implementation](config)
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
        self.activation = ACT2FN[config.hidden_act]
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)

        ffn_output = apply_chunking_to_forward(
            self.ff_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output[0],
        )
        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])

        return (hidden_states,) + attention_output[1:]  # add attentions if we output them

    def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
        ffn_output = self.ffn(attention_output)
        ffn_output = self.activation(ffn_output)
        ffn_output = self.ffn_output(ffn_output)
        return ffn_output


class AlbertLayerGroup(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
        layer_hidden_states = ()
        layer_attentions = ()

        for layer_index, albert_layer in enumerate(self.albert_layers):
            layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions)
            hidden_states = layer_output[0]

            if output_attentions:
                layer_attentions = layer_attentions + (layer_output[1],)

            if output_hidden_states:
                layer_hidden_states = layer_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if output_hidden_states:
            outputs = outputs + (layer_hidden_states,)
        if output_attentions:
            outputs = outputs + (layer_attentions,)
        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)


class AlbertTransformer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[BaseModelOutput, Tuple]:
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        all_hidden_states = (hidden_states,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask

        for i in range(self.config.num_hidden_layers):
            # Number of layers in a hidden group
            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)

            # Index of the hidden group
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))

            layer_group_output = self.albert_layer_groups[group_idx](
                hidden_states,
                attention_mask,
                head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
                output_attentions,
                output_hidden_states,
            )
            hidden_states = layer_group_output[0]

            if output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )


@auto_docstring
class AlbertPreTrainedModel(PreTrainedModel):
    config_class = AlbertConfig
    load_tf_weights = load_tf_weights_in_albert
    base_model_prefix = "albert"
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, AlbertMLMHead):
            module.bias.data.zero_()

@dataclass
class AlbertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`AlbertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    sop_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@auto_docstring
class AlbertModel(AlbertPreTrainedModel):
    config_class = AlbertConfig
    base_model_prefix = "albert"

    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        if add_pooling_layer:
            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
            self.pooler_activation = nn.Tanh()
        else:
            self.pooler = None
            self.pooler_activation = None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
        a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
        model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.

        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden group,
        while [2,3] correspond to the two inner groups of the second hidden group.

        Any layer with an index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
        information about head pruning.
        """
        for layer, heads in heads_to_prune.items():
            group_idx = int(layer / self.config.inner_group_num)
            inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPooling, Tuple]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        use_sdpa_attention_mask = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        if use_sdpa_attention_mask:
            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                attention_mask, embedding_output.dtype, tgt_len=seq_length
            )
        else:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]

        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

@auto_docstring(
    custom_intro="""
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """
)
class AlbertForPreTraining(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)
        self.sop_classifier = AlbertSOPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        sentence_order_label: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[AlbertForPreTrainingOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]

        prediction_scores = self.predictions(sequence_output)
        sop_scores = self.sop_classifier(pooled_output)

        total_loss = None
        if labels is not None and sentence_order_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
            total_loss = masked_lm_loss + sentence_order_loss

        if not return_dict:
            output = (prediction_scores, sop_scores) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return AlbertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AlbertMLMHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.decoder(hidden_states)

        prediction_scores = hidden_states

        return prediction_scores

    def _tie_weights(self) -> None:
        # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
        if self.decoder.bias.device.type == "meta":
            self.decoder.bias = self.bias
        else:
            self.bias = self.decoder.bias


class AlbertSOPHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits

@auto_docstring
class AlbertForMaskedLM(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.predictions = AlbertMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings
        self.predictions.bias = new_embeddings.bias

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_outputs = outputs[0]

        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring(
    custom_intro="""
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring
class AlbertForTokenClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        classifier_dropout_prob = (
            config.classifier_dropout_prob
            if config.classifier_dropout_prob is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, Tuple]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds an extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring
class AlbertForMultipleChoice(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MultipleChoiceModelOutput, Tuple]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "load_tf_weights_in_albert",
    "AlbertPreTrainedModel",
    "AlbertModel",
    "AlbertForPreTraining",
    "AlbertForMaskedLM",
    "AlbertForSequenceClassification",
    "AlbertForTokenClassification",
    "AlbertForQuestionAnswering",
    "AlbertForMultipleChoice",
]