o
    Zh`                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& e$'e(Z)da*dd Z+dd Z,dRddZ-dRddZ.dRddZ/dd Z0G dd dej1j2Z3G dd  d ej1j2Z4G d!d" d"Z5dSd#d$Z6d%d& Z7			dTd'd(Z8G d)d* d*e
j9Z:G d+d, d,e
j9Z;G d-d. d.e
j9Z<G d/d0 d0e
j9Z=G d1d2 d2e
j9Z>G d3d4 d4e
j9Z?G d5d6 d6e
j9Z@G d7d8 d8e
j9ZAG d9d: d:e
j9ZBG d;d< d<e
j9ZCG d=d> d>e
j9ZDe!G d?d@ d@eZEe!G dAdB dBeEZFe!G dCdD dDeEZGG dEdF dFe
j9ZHe!dGdHG dIdJ dJeEZIe!G dKdL dLeEZJe!G dMdN dNeEZKe!G dOdP dPeEZLg dQZMdS )UzPyTorch MRA model.    N)Path)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigc                     sD   t t jjjd d   fdd} | g d}td|ddad S )	NZkernelsmrac                    s    fdd| D S )Nc                    s   g | ]} | qS  r   ).0fileZ
src_folderr   S/var/www/auris/lib/python3.10/site-packages/transformers/models/mra/modeling_mra.py
<listcomp>4       z:load_cuda_kernels.<locals>.append_root.<locals>.<listcomp>r   )filesr!   r   r"   append_root3   s   z&load_cuda_kernels.<locals>.append_root)zcuda_kernel.cuzcuda_launch.cuztorch_extension.cppZcuda_kernelT)verbose)r   __file__resolveparentr
   mra_cuda_kernel)r&   Z	src_filesr   r!   r"   load_cuda_kernels/   s   r,   c                 C   s   t |  dkrtdt | dkrtd| ddkr#td| ddkr.td| jd	d
jdd	}| }| }| }t	||||\}}|dd	dddddddf }||fS )z8
    Computes maximum values for softmax stability.
       z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr+   Z	index_max)sparse_qk_prodindicesquery_num_blockkey_num_blockZ
index_valsmax_valsmax_vals_scatterr   r   r"   
sparse_max;   s   $rC   r0   c                 C   s   t |  dkrtdt | dkrtd| jd |jd kr&td| j\}}|| }tj|dtj|jd}| |||} | |dddf ||  ddf } | S )zN
    Converts attention mask to a sparse mask for high resolution logits.
    r.   z$mask must be a 2-dimensional tensor.r/   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r5   r6   r7   shapetorcharangelongrF   reshape)maskr>   
block_size
batch_sizeseq_len	num_block	batch_idxr   r   r"   sparse_maskW   s   
&rR   c           	      C   s"  |   \}}}|  \}}}|| dkrtd|| dkr"td| ||| ||dd} |||| ||dd}t|   dkrJtdt|  dkrVtdt|  d	krbtd
|  ddkrmtd| ddkrxtd|  } | }| }| }t| || S )z7
    Performs Sampled Dense Matrix Multiplication.
    r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r4   r1   r-   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r.   r/   r   r0   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r6   r7   rK   r:   r5   r;   r<   r+   mm_to_sparse)	dense_query	dense_keyr>   rM   rN   Z
query_sizer3   _key_sizer   r   r"   rU   n   s.   rU   c           	      C   s  |  \}}}|| dkrtd|  d|krtd|  d|kr'td|||| ||dd}t|   d	krAtd
t|  d	krMtdt|  dkrYtd| ddkrdtd|  } | }| }| }t| |||}|dd||| |}|S )zP
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    r   rS   r.   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r4   r1   r-   ,sparse_query must be a 4-dimensional tensor.rT   r/   r0   z8The size of the third dimension of dense_key must be 32.)	r6   r7   rK   r:   r5   r;   r<   r+   sparse_dense_mm)	sparse_queryr>   rW   r?   rM   rN   rY   r3   Zdense_qk_prodr   r   r"   r[      s.   r[   c                 C   s    | | | t j| |dd  S )NfloorZrounding_mode)rH   divrJ   )r>   Zdim_1_blockZdim_2_blockr   r   r"   transpose_indices   s    r`   c                   @   s2   e Zd Zedd Zedd Zed	ddZdS )
MraSampledDenseMatMulc                 C   &   t ||||}| ||| || _|S N)rU   save_for_backwardrM   )ctxrV   rW   r>   rM   r=   r   r   r"   forward      zMraSampledDenseMatMul.forwardc                 C   sj   | j \}}}| j}|d| }|d| }t|||}t|dd|||}	t||||}
|
|	d d fS Nr   r4   r1   )saved_tensorsrM   r6   r`   r[   r:   )re   gradrV   rW   r>   rM   r?   r@   	indices_Tgrad_key
grad_queryr   r   r"   backward   s   zMraSampledDenseMatMul.backwardr0   c                 C      t | |||S rc   )ra   apply)rV   rW   r>   rM   r   r   r"   operator_call      z#MraSampledDenseMatMul.operator_callNr0   __name__
__module____qualname__staticmethodrf   rn   rq   r   r   r   r"   ra      s    


ra   c                   @   s0   e Zd Zedd Zedd Zedd ZdS )MraSparseDenseMatMulc                 C   rb   rc   )r[   rd   r?   )re   r\   r>   rW   r?   r=   r   r   r"   rf      rg   zMraSparseDenseMatMul.forwardc           
      C   s`   | j \}}}| j}|d|d }t|||}t|dd|||}t|||}	|	d |d fS rh   )ri   r?   r6   r`   r[   r:   rU   )
re   rj   r\   r>   rW   r?   r@   rk   rl   rm   r   r   r"   rn      s   zMraSparseDenseMatMul.backwardc                 C   ro   rc   )ry   rp   )r\   r>   rW   r?   r   r   r"   rq      rr   z"MraSparseDenseMatMul.operator_callNrt   r   r   r   r"   ry      s    

	ry   c                   @   s   e Zd Zedd ZdS )MraReduceSumc                 C   s  |   \}}}}t|   dkrtdt|  dkr td|   \}}}}|  \}}| jdd|| |} tj| dtj|jd}tj	||dd	 |d d d f |  || }	tj
|| |f| j| jd}
|
d|	| |||}|||| }|S )
Nr-   rZ   r.   r/   r2   r   rD   r]   r^   )r6   r5   r7   sumrK   rH   rI   rJ   rF   r_   zerosrE   Z	index_add)r\   r>   r?   r@   rN   rP   rM   rX   rQ   Zglobal_idxestempoutputr   r   r"   rq      s$   &
zMraReduceSum.operator_callN)ru   rv   rw   rx   rq   r   r   r   r"   rz      s    rz   c                 C   s  |   \}}}|| }d}	|durl||||jdd}
| ||||jdd|
dddddf d  }|||||jdd|
dddddf d  }|durk|||||jdd|
dddddf d  }	n5|tj||tj| jd }
| ||||jdd}|||||jdd}|dur|||||jdd}	t||	ddt
| }|jdddj}|dur|d	|
dddddf |
dddddf  d
k    }||
||	fS )z/
    Compute low resolution approximation.
    Nr4   r2   r1   ư>rD   T)r3   Zkeepdims     @g      ?)r6   rK   r{   rH   onesfloatrF   meanmatmulr:   mathsqrtr8   r9   )querykeyrM   rL   valuerN   rO   head_dimnum_block_per_row	value_hattoken_countZ	query_hatZkey_hatlow_resolution_logitlow_resolution_logit_row_maxr   r   r"   get_low_resolution_logit  s6   :r   c                 C   sT  | j \}}}|dkr3|d }tj||| jd}	tjtj|	| d|d}
| |
dddddf d  } |dkrk| ddd|ddf d | ddd|ddf< | ddddd|f d | ddddd|f< tj| |d|ddd	d
}|j}|dkr|j	j
ddj	}| |ddddf k }||fS |dkrd}||fS t| d)zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r.   rF   )ZdiagonalNg     @r4   TF)r3   Zlargestsortedfullr2   sparsez# is not a valid approx_model value.)rG   rH   r   rF   ZtrilZtriuZtopkrK   r>   r9   minr   r7   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrN   Ztotal_blocks_per_rowrX   offsetZ	temp_maskZdiagonal_maskZ
top_k_valsr>   	thresholdhigh_resolution_maskr   r   r"   get_block_idxes6  s.   r   c	           $      C   s  t du rt|  S |  \}	}
}}|	|
 }|| dkr!td|| }| |||} ||||}||||}|dure| |dddddf  } ||dddddf  }||dddddf  }|dkrvt| ||||\}}}}n(|dkrt  t| |||\}}}}W d   n1 sw   Y  nt	dt  || }t
|||||\}}W d   n1 sw   Y  tj| |||dt| }t||||\}}|| }|dur|dd	t||dddddddf    }t|}t||||}t||||}|dkrt|| d|  |dddddf  }t||dddddddf d	d	|d	|||}|jd
ddddddf d	d	|||}|d	d	|||| } |durq| | } t| | dk  }!||!dddddf  }||! }t|  | dk  }"||"dddddf  }||" }|| |dddddf |dddddf  d  }#n|dkr||dddddf d  }#nt	d|dur|#|dddddf  }#|#|	|
||}#|#S )z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rM   r   r   r4   r2   r   z-config.approx_mode must be "full" or "sparse")r+   rH   Z
zeros_likeZrequires_grad_r6   r7   rK   r   Zno_grad	Exceptionr   ra   rq   r   r   rC   rR   expry   rz   r   repeatr{   r   )$r   r   r   rL   r   r   rM   r   r   rN   Znum_headrO   r   Z
meta_batchr   r   r   r   r   rX   Zlow_resolution_logit_normalizedr>   r   Zhigh_resolution_logitrA   rB   Zhigh_resolution_attnZhigh_resolution_attn_outZhigh_resolution_normalizerZlow_resolution_attnZlow_resolution_attn_outZlow_resolution_normalizerZlog_correctionZlow_resolution_corrZhigh_resolution_corrcontext_layerr   r   r"   mra2_attention\  s   




.

"
.
.
 
r   c                       s*   e Zd ZdZ fddZdddZ  ZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|jd |j| _	t|j
|j| _tj|j|jd| _t|j| _| dt|jdd  t|dd| _| jdtj| j tj| jjd	d
d d S )N)padding_idxr.   Zepsposition_ids)r   r4   position_embedding_typeabsolutetoken_type_idsrD   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizeZpad_token_idword_embeddingsmax_position_embeddingsposition_embeddingsZtype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutZregister_bufferrH   rI   expandgetattrr   r|   r   r6   rJ   rF   selfconfig	__class__r   r"   r     s   

zMraEmbeddings.__init__Nc                 C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u rNt| drC| jd d d |f }||d |}|}ntj|tj| jjd}|d u rW| 	|}| 
|}	||	 }
| jdkrn| |}|
|7 }
| |
}
| |
}
|
S )Nr4   r   r   r   rD   r   )r6   r   hasattrr   r   rH   r|   rJ   rF   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   r   r   r"   rf     s,   







zMraEmbeddings.forward)NNNNru   rv   rw   __doc__r   rf   __classcell__r   r   r   r"   r     s    r   c                       0   e Zd Zd fdd	Zdd ZdddZ  ZS )	MraSelfAttentionNc              
      s`  t    |j|j dkrt|dstd|j d|j dtd u}t rKt rK|sKzt	  W n t
yJ } ztd|  W Y d }~nd }~ww |j| _t|j|j | _| j| j | _t|j| j| _t|j| j| _t|j| j| _t|j| _|d ur|n|j| _|jd |j | _t| jt|jd d | _|j| _|j| _|j| _d S )	Nr   Zembedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r0   r.   ) r   r   r   num_attention_headsr   r7   r+   r   r   r,   r   loggerwarningr<   attention_head_sizeall_head_sizer   Linearr   r   r   r   Zattention_probs_dropout_probr   r   r   Zblock_per_rowrP   r   r   r   r   )r   r   r   Zkernel_loadeder   r   r"   r   
  s:   


zMraSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr4   r   r.   r   r   )r6   r   r   viewpermute)r   layerZnew_layer_shaper   r   r"   transpose_for_scores-  s   
z%MraSelfAttention.transpose_for_scoresc              
   C   s  |  |}| | |}| | |}| |}| \}}}	}
d|d  }| d|d|| |	 }d}|
|k rt|||	||
 f}t	j
|t	j||jdgdd}t	j
|t	j||jdgdd}t	j
|t	j||jdgdd}t| | | | | j| j| j| jd}|
|k r|d d d d d d d |
f }||||	|
}|d	d
dd }| d d | jf }|j| }|f}|S )N      ?r   r   r0   r   r4   r2   )r   r   r   r   r.   r   r1   )r   r   r   r   r6   squeezer   rK   r<   rH   catr|   rF   r   r   rP   r   r   r   r   r;   r   r   )r   hidden_statesattention_maskZmixed_query_layerZ	key_layerZvalue_layerZquery_layerrN   Z	num_headsrO   r   Zgpu_warp_sizeZpad_sizer   Znew_context_layer_shapeoutputsr   r   r"   rf   2  s@   

  
zMraSelfAttention.forwardrc   )ru   rv   rw   r   r   rf   r   r   r   r   r"   r   	  s    #r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )MraSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r   r   r   r   r   denser   r   r   r   r   r   r   r   r"   r   g     
zMraSelfOutput.__init__r   input_tensorreturnc                 C   &   |  |}| |}| || }|S rc   r   r   r   r   r   r   r   r   r"   rf   m     

zMraSelfOutput.forwardru   rv   rw   r   rH   Tensorrf   r   r   r   r   r"   r   f      $r   c                       r   )	MraAttentionNc                    s.   t    t||d| _t|| _t | _d S )N)r   )r   r   r   r   r   r~   setpruned_heads)r   r   r   r   r   r"   r   u  s   

zMraAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r2   )r5   r   r   r   r   r   r   r   r   r   r~   r   r   union)r   headsindexr   r   r"   prune_heads{  s   zMraAttention.prune_headsc                 C   s2   |  ||}| |d |}|f|dd   }|S Nr   r   )r   r~   )r   r   r   Zself_outputsattention_outputr   r   r   r"   rf     s   zMraAttention.forwardrc   )ru   rv   rw   r   r   rf   r   r   r   r   r"   r   t  s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )MraIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rc   )r   r   r   r   r   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   r   r   r"   r     s
   
zMraIntermediate.__init__r   r   c                 C      |  |}| |}|S rc   )r   r   r   r   r   r   r"   rf        

zMraIntermediate.forwardr   r   r   r   r"   r     s    r   c                       r   )	MraOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r     r   zMraOutput.__init__r   r   r   c                 C   r   rc   r   r   r   r   r"   rf     r   zMraOutput.forwardr   r   r   r   r"   r     r   r   c                       s.   e Zd Z fddZdddZdd Z  ZS )	MraLayerc                    sB   t    |j| _d| _t|| _|j| _t|| _t	|| _
d S Nr   )r   r   chunk_size_feed_forwardseq_len_dimr   	attentionZadd_cross_attentionr   intermediater   r~   r   r   r   r"   r     s   


zMraLayer.__init__Nc                 C   sB   |  ||}|d }|dd  }t| j| j| j|}|f| }|S r   )r  r   feed_forward_chunkr  r  )r   r   r   Zself_attention_outputsr   r   layer_outputr   r   r"   rf     s   
zMraLayer.forwardc                 C   s   |  |}| ||}|S rc   )r  r~   )r   r   Zintermediate_outputr  r   r   r"   r    s   
zMraLayer.feed_forward_chunkrc   )ru   rv   rw   r   rf   r  r   r   r   r   r"   r    s    
	r  c                       s.   e Zd Z fddZ				dddZ  ZS )	
MraEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r   )r  )r   rX   r   r   r"   r#     r$   z'MraEncoder.__init__.<locals>.<listcomp>F)	r   r   r   r   Z
ModuleListrangenum_hidden_layersr   gradient_checkpointingr   r   r
  r"   r     s   
 
zMraEncoder.__init__NFTc           
      C   s   |rdnd }t | jD ]#\}}|r||f }| jr%| jr%| |j||}	n|||}	|	d }q|r6||f }|sCtdd ||fD S t||dS )Nr   r   c                 s   s    | ]	}|d ur|V  qd S rc   r   )r   vr   r   r"   	<genexpr>  s    z%MraEncoder.forward.<locals>.<genexpr>)last_hidden_stater   )	enumerater   r  ZtrainingZ_gradient_checkpointing_func__call__tupler   )
r   r   r   	head_maskoutput_hidden_statesreturn_dictZall_hidden_statesiZlayer_moduleZlayer_outputsr   r   r"   rf     s(   



zMraEncoder.forward)NNFT)ru   rv   rw   r   rf   r   r   r   r   r"   r	    s    	r	  c                       r   )MraPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S r   )r   r   r   r   r   r   r   r   r   r   transform_act_fnr   r   r   r   r   r"   r     s   
z#MraPredictionHeadTransform.__init__r   r   c                 C   s"   |  |}| |}| |}|S rc   )r   r  r   r   r   r   r"   rf     s   


z"MraPredictionHeadTransform.forwardr   r   r   r   r"   r    s    	r  c                       s,   e Zd Z fddZdd Zdd Z  ZS )MraLMPredictionHeadc                    sL   t    t|| _tj|j|jdd| _t	t
|j| _| j| j_d S )NF)bias)r   r   r  	transformr   r   r   r   decoder	ParameterrH   r|   r  r   r   r   r"   r     s
   

zMraLMPredictionHead.__init__c                 C   s   | j | j_ d S rc   )r  r  r   r   r   r"   _tie_weights  s   z MraLMPredictionHead._tie_weightsc                 C   r   rc   )r  r  r   r   r   r"   rf     r   zMraLMPredictionHead.forward)ru   rv   rw   r   r   rf   r   r   r   r   r"   r    s    r  c                       r   )MraOnlyMLMHeadc                    s   t    t|| _d S rc   )r   r   r  predictionsr   r   r   r"   r   '  s   
zMraOnlyMLMHead.__init__sequence_outputr   c                 C   s   |  |}|S rc   )r"  )r   r#  prediction_scoresr   r   r"   rf   +  s   
zMraOnlyMLMHead.forwardr   r   r   r   r"   r!  &  s    r!  c                   @   s    e Zd ZeZdZdZdd ZdS )MraPreTrainedModelr   Tc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS dS )zInitialize the weightsg        )r   ZstdNr   )r   r   r   weightdataZnormal_r   Zinitializer_ranger  Zzero_r   r   r   Zfill_)r   moduler   r   r"   _init_weights7  s   

z MraPreTrainedModel._init_weightsN)ru   rv   rw   r   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingr)  r   r   r   r"   r%  0  s
    r%  c                       s   e Zd Z fddZdd Zdd Zdd Ze																dd
ee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee deeef fddZ  ZS )MraModelc                    s2   t  | || _t|| _t|| _|   d S rc   )r   r   r   r   r   r	  encoder	post_initr   r   r   r"   r   J  s
   

zMraModel.__init__c                 C   s   | j jS rc   r   r   r  r   r   r"   get_input_embeddingsT  s   zMraModel.get_input_embeddingsc                 C   s   || j _d S rc   r-  )r   r   r   r   r"   set_input_embeddingsW  s   zMraModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr+  r   r  r   )r   Zheads_to_pruner   r   r   r   r"   _prune_headsZ  s   zMraModel._prune_headsNr   r   r   r   r  r   r  r  r   c	                 C   s|  |d ur|n| j j}|d ur|n| j j}|d ur |d ur td|d ur/| || | }	n|d ur<| d d }	ntd|	\}
}|d urK|jn|j}|d u r[tj|
|f|d}|d u rt	| j
drz| j
jd d d |f }||
|}|}n	tj|	tj|d}| ||	}| || j j}| j
||||d}| j|||||d}|d	 }|s|f|d
d   S t||j|j|jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer4   z5You have to specify either input_ids or inputs_embedsr   r   rD   )r   r   r   r   )r   r  r  r  r   r   )r  r   
attentionscross_attentions)r   r  use_return_dictr7   Z%warn_if_padding_and_no_attention_maskr6   rF   rH   r   r   r   r   r   r|   rJ   Zget_extended_attention_maskZget_head_maskr  r+  r   r   r2  r3  )r   r   r   r   r   r  r   r  r  r   rN   r   rF   r   r   Zextended_attention_maskZembedding_outputZencoder_outputsr#  r   r   r"   rf   b  sZ   
zMraModel.forward)NNNNNNNN)ru   rv   rw   r   r.  r/  r1  r   r   rH   r   boolr   r   r   rf   r   r   r   r   r"   r*  H  sB    
	

r*  c                       s   e Zd ZddgZ fddZdd Zdd Ze																		dd
ee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee deeef fddZ  ZS )MraForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    s,   t  | t|| _t|| _|   d S rc   )r   r   r*  r   r!  clsr,  r   r   r   r"   r     s   

zMraForMaskedLM.__init__c                 C   s
   | j jjS rc   )r7  r"  r  r  r   r   r"   get_output_embeddings  s   
z$MraForMaskedLM.get_output_embeddingsc                 C   s   || j j_|j| j j_d S rc   )r7  r"  r  r  )r   Znew_embeddingsr   r   r"   set_output_embeddings  s   
z$MraForMaskedLM.set_output_embeddingsNr   r   r   r   r  r   labelsr  r  r   c
              
   C   s   |	dur|	n| j j}	| j||||||||	d}
|
d }| |}d}|dur7t }||d| j j|d}|	sM|f|
dd  }|durK|f| S |S t|||
j|
j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr   r   r   r  r   r  r  r   r4   r   losslogitsr   r2  )
r   r4  r   r7  r   r   r   r   r   r2  )r   r   r   r   r   r  r   r:  r  r  r   r#  r$  Zmasked_lm_lossloss_fctr~   r   r   r"   rf     s4   
zMraForMaskedLM.forward	NNNNNNNNN)ru   rv   rw   Z_tied_weights_keysr   r8  r9  r   r   rH   r   r5  r   r   r   rf   r   r   r   r   r"   r6    sH    		

r6  c                       s(   e Zd ZdZ fddZdd Z  ZS )MraClassificationHeadz-Head for sentence-level classification tasks.c                    sF   t    t|j|j| _t|j| _t|j|j	| _
|| _d S rc   )r   r   r   r   r   r   r   r   r   
num_labelsout_projr   r   r   r   r"   r     s
   

zMraClassificationHead.__init__c                 K   sR   |d d dd d f }|  |}| |}t| jj |}|  |}| |}|S )Nr   )r   r   r   r   r   rC  )r   featureskwargsxr   r   r"   rf     s   



zMraClassificationHead.forwardr   r   r   r   r"   rA    s    rA  z
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )Zcustom_introc                          e Zd Z fddZe									ddeej deej deej deej deej d	eej d
eej dee dee de	e
ef fddZ  ZS )MraForSequenceClassificationc                    s4   t  | |j| _t|| _t|| _|   d S rc   )r   r   rB  r*  r   rA  
classifierr,  r   r   r   r"   r     s
   

z%MraForSequenceClassification.__init__Nr   r   r   r   r  r   r:  r  r  r   c
              
   C   sf  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}d}|dur| j jdu rP| jdkr6d| j _n| jdkrL|jtjksG|jtj	krLd| j _nd| j _| j jdkrnt
 }| jdkrh|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|	s|f|
dd  }|dur|f| S |S t|||
j|
jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr;  r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationr4   r<  )r   r4  r   rI  Zproblem_typerB  rE   rH   rJ   r<   r	   r   r   r   r   r   r   r2  )r   r   r   r   r   r  r   r:  r  r  r   r#  r>  r=  r?  r~   r   r   r"   rf     sR   


"


z$MraForSequenceClassification.forwardr@  )ru   rv   rw   r   r   r   rH   r   r5  r   r   r   rf   r   r   r   r   r"   rH    sB    		

rH  c                       rG  )MraForMultipleChoicec                    sD   t  | t|| _t|j|j| _t|jd| _| 	  d S r  )
r   r   r*  r   r   r   r   pre_classifierrI  r,  r   r   r   r"   r   e  s
   
zMraForMultipleChoice.__init__Nr   r   r   r   r  r   r:  r  r  r   c
              
   C   s  |	dur|	n| j j}	|dur|jd n|jd }
|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	d}|d }|dddf }| |}t |}| 	|}|d|
}d}|durt
 }|||}|	s|f|dd  }|dur|f| S |S t|||j|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r4   r1   r;  r   r<  )r   r4  rG   r   r6   r   rK  r   ZReLUrI  r   r   r   r2  )r   r   r   r   r   r  r   r:  r  r  Znum_choicesr   Zhidden_stateZpooled_outputr>  Zreshaped_logitsr=  r?  r~   r   r   r"   rf   o  sN   +


zMraForMultipleChoice.forwardr@  )ru   rv   rw   r   r   r   rH   r   r5  r   r   r   rf   r   r   r   r   r"   rJ  c  sB    
	

rJ  c                       rG  )MraForTokenClassificationc                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S rc   )r   r   rB  r*  r   r   r   r   r   r   r   rI  r,  r   r   r   r"   r     s   
z"MraForTokenClassification.__init__Nr   r   r   r   r  r   r:  r  r  r   c
              
   C   s  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}| |}d}|durdt }|durW|ddk}|d| j}t	||dt
|j|}|||}n||d| j|d}|	sz|f|
dd  }|durx|f| S |S t|||
j|
jdS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr;  r   r4   r   r<  )r   r4  r   r   rI  r   r   rB  rH   whereZtensorignore_indexZtype_asr   r   r2  )r   r   r   r   r   r  r   r:  r  r  r   r#  r>  r=  r?  Zactive_lossZactive_logitsZactive_labelsr~   r   r   r"   rf     sD   

z!MraForTokenClassification.forwardr@  )ru   rv   rw   r   r   r   rH   r   r5  r   r   r   rf   r   r   r   r   r"   rL    sB    	

rL  c                       s   e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej deej dee dee de	e
ef fddZ  ZS )MraForQuestionAnsweringc                    sB   t  | d|_|j| _t|| _t|j|j| _| 	  d S )Nr.   )
r   r   rB  r*  r   r   r   r   
qa_outputsr,  r   r   r   r"   r     s   
z MraForQuestionAnswering.__init__Nr   r   r   r   r  r   start_positionsend_positionsr  r  r   c              
   C   s>  |
d ur|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d}|d}d }|d ur}|d ur}t| dkrJ|d}t| dkrW|d}|d}|d|}|d|}t	|d}|||}|||}|| d }|
s||f|dd   }|d ur|f| S |S t
||||j|jdS )	Nr;  r   r   r4   r2   )rN  r.   )r=  start_logits
end_logitsr   r2  )r   r4  r   rP  splitr   r5   r6   clampr   r   r   r2  )r   r   r   r   r   r  r   rQ  rR  r  r  r   r#  r>  rS  rT  Z
total_lossZignored_indexr?  Z
start_lossZend_lossr~   r   r   r"   rf   #  sN   








zMraForQuestionAnswering.forward)
NNNNNNNNNN)ru   rv   rw   r   r   r   rH   r   r5  r   r   r   rf   r   r   r   r   r"   rO    sH    	

rO  )r6  rJ  rO  rH  rL  r  r*  r%  rs   )NN)r0   r   r   )Nr   r   pathlibr   typingr   r   r   rH   Ztorch.utils.checkpointr   Ztorch.nnr   r   r	   Ztorch.utils.cpp_extensionr
   Zactivationsr   Zmodeling_outputsr   r   r   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   r   utilsr   r   r   r   Zconfiguration_mrar   Z
get_loggerru   r   r+   r,   rC   rR   rU   r[   r`   ZautogradFunctionra   ry   rz   r   r   r   Moduler   r   r   r   r   r   r  r	  r  r  r!  r%  r*  r6  rA  rH  rJ  rL  rO  __all__r   r   r   r"   <module>   sz    



((
(-
s:]!,
gHOgIM