import math
from contextlib import nullcontext
from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_flash_attn_2_available, logging
from ...utils.import_utils import is_triton_available
from .configuration_modernbert import ModernBertConfig

if is_flash_attn_2_available():
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
    from flash_attn.layers.rotary import RotaryEmbedding
    from flash_attn.ops.triton.rotary import apply_rotary
else:
    RotaryEmbedding = object

logger = logging.get_logger(__name__)
class ApplyRotaryEmbUnpad(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        qkv,
        cos,
        sin,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ):
        # qkv: (total_nnz, 3, nheads, headdim)
        qkv = qkv.contiguous()
        total_nnz, _three, _nheads, headdim = qkv.shape
        # Reshape the query and key halves so the Triton kernel can rotate them in-place.
        qk = qkv[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            qk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=False,
            inplace=True,
        )

        ctx.save_for_backward(cos, sin, cu_seqlens)
        ctx.max_seqlen = max_seqlen
        return qkv

    @staticmethod
    def backward(ctx, do):
        cos, sin, cu_seqlens = ctx.saved_tensors
        do = do.contiguous()
        total_nnz, _three, _nheads, headdim = do.shape
        dqk = do[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            dqk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=False,
            inplace=True,
            conjugate=True,
        )

        return do, None, None, None, None
def apply_rotary_unpadded(
    qkv,
    cos,
    sin,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    """
    return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)
class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
    """
    The rotary position embeddings applied directly to unpadded sequences.
    """

    def __init__(
        self,
        dim: int,
        base: float = 10000.0,
        max_seqlen: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        """
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        """
        super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
        self.max_seqlen = max_seqlen

        if max_seqlen is not None and device is not None and dtype is not None:
            self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        cu_seqlens: torch.Tensor,
        max_seqlen: Optional[int] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        """
        if max_seqlen is not None:
            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)

        qkv = apply_rotary_unpadded(
            qkv,
            self._cos_cached,
            self._sin_cached,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        return qkv

    def extra_repr(self) -> str:
        return f"dim={self.dim}, base={self.base}, scale_base={self.scale_base}"
class ModernBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.drop = nn.Dropout(config.embedding_dropout)

    @torch.compile(dynamic=True)
    def compiled_embeddings(self, input_ids: torch.LongTensor) -> torch.Tensor:
        return self.drop(self.norm(self.tok_embeddings(input_ids)))

    def forward(
        self, input_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        if inputs_embeds is not None:
            hidden_states = self.drop(self.norm(inputs_embeds))
        else:
            hidden_states = (
                self.compiled_embeddings(input_ids)
                if self.config.reference_compile
                else self.drop(self.norm(self.tok_embeddings(input_ids)))
            )
        return hidden_states
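
# Hedged usage sketch (illustrative only, not part of the upstream API): how the embedding block
# above turns token ids into normalized, dropout-regularized hidden states. The tiny config values
# and `pad_token_id=0` are arbitrary; `reference_compile=False` is assumed to be accepted by
# ModernBertConfig and simply disables the torch.compile path.
def _example_modernbert_embeddings() -> torch.Tensor:
    config = ModernBertConfig(
        vocab_size=128,
        hidden_size=32,
        intermediate_size=48,
        num_hidden_layers=2,
        num_attention_heads=4,
        pad_token_id=0,
        reference_compile=False,
    )
    embeddings = ModernBertEmbeddings(config)
    input_ids = torch.randint(0, config.vocab_size, (2, 10))  # (batch, seqlen)
    hidden_states = embeddings(input_ids=input_ids)
    assert hidden_states.shape == (2, 10, config.hidden_size)
    return hidden_states
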
class ModernBertMLP(nn.Module):
    """Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.Wi = nn.Linear(config.hidden_size, int(config.intermediate_size) * 2, bias=config.mlp_bias)
        self.act = ACT2FN[config.hidden_activation]
        self.drop = nn.Dropout(config.mlp_dropout)
        self.Wo = nn.Linear(config.intermediate_size, config.hidden_size, bias=config.mlp_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input, gate = self.Wi(hidden_states).chunk(2, dim=-1)
        return self.Wo(self.drop(self.act(input) * gate))


class ModernBertRotaryEmbedding(nn.Module):
    def __init__(self, config: ModernBertConfig, dim: int, base: float, device: Optional[torch.device] = None):
        super().__init__()

        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type", "default"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(None, device, dim=dim, base=base)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
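
# Hedged shape sketch (illustrative only): builds cos/sin tables the way the rotary module above
# does (outer product of positions with inverse frequencies, duplicated across the last dim) and
# applies them with `apply_rotary_pos_emb`, which is defined just below. All sizes are toy values.
def _example_rotary_position_embedding() -> Tuple[torch.Tensor, torch.Tensor]:
    batch, num_heads, seq_len, head_dim = 2, 4, 8, 16
    inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))  # (head_dim / 2,)
    positions = torch.arange(seq_len).float()
    freqs = torch.outer(positions, inv_freq)  # (seq_len, head_dim / 2)
    emb = torch.cat((freqs, freqs), dim=-1)  # (seq_len, head_dim)
    cos, sin = emb.cos()[None, :, :], emb.sin()[None, :, :]  # add a batch dim for broadcasting

    q = torch.randn(batch, num_heads, seq_len, head_dim)
    k = torch.randn(batch, num_heads, seq_len, head_dim)
    q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)  # unsqueeze_dim=1 broadcasts over heads
    assert q_rot.shape == q.shape and k_rot.shape == k.shape
    return q_rot, k_rot
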
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def eager_attention_forward(
    module: "ModernBertAttention",
    qkv: torch.Tensor,
    attention_mask: torch.Tensor,
    sliding_window_mask: torch.Tensor,
    position_ids: Optional[torch.LongTensor],
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    output_attentions: Optional[bool] = False,
    **_kwargs,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
    # qkv: [batch_size, seqlen, 3, nheads, headdim]
    cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
    query, key, value = qkv.transpose(3, 1).unbind(dim=2)
    # query, key, value: [batch_size, heads, seq_len, head_dim]
    query, key = apply_rotary_pos_emb(query, key, cos, sin)

    scale = module.head_dim**-0.5
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scale

    if local_attention != (-1, -1):
        attention_mask = sliding_window_mask

    attn_weights = attn_weights + attention_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=module.attention_dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.view(bs, -1, dim)
    if output_attentions:
        return (attn_output, attn_weights)
    return (attn_output,)


def flash_attention_forward(
    module: "ModernBertAttention",
    qkv: torch.Tensor,
    rotary_emb: ModernBertUnpaddedRotaryEmbedding,
    cu_seqlens: torch.Tensor,
    max_seqlen: int,
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    target_dtype: torch.dtype = torch.bfloat16,
    **_kwargs,
) -> Tuple[torch.Tensor]:
    # qkv: (total_seqlen, 3, nheads, headdim)
    qkv = rotary_emb(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)

    convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
    if convert_dtype:
        # Flash Attention 2 only supports fp16 and bf16 inputs, so cast and cast back.
        orig_dtype = qkv.dtype
        qkv = qkv.to(target_dtype)

        attn = flash_attn_varlen_qkvpacked_func(
            qkv,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            dropout_p=module.attention_dropout if module.training else 0.0,
            deterministic=module.deterministic_flash_attn,
            window_size=local_attention,
        )
        attn = attn.to(orig_dtype)
    else:
        attn = flash_attn_varlen_qkvpacked_func(
            qkv,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            dropout_p=module.attention_dropout if module.training else 0.0,
            deterministic=module.deterministic_flash_attn,
            window_size=local_attention,
        )
    return (attn.view(bs, dim),)


def sdpa_attention_forward(
    module: "ModernBertAttention",
    qkv: torch.Tensor,
    attention_mask: torch.Tensor,
    sliding_window_mask: torch.Tensor,
    position_ids: Optional[torch.LongTensor],
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    **_kwargs,
) -> Tuple[torch.Tensor]:
    # qkv: [batch_size, seqlen, 3, nheads, headdim]
    cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
    query, key, value = qkv.transpose(3, 1).unbind(dim=2)
    # query, key, value: [batch_size, heads, seq_len, head_dim]
    query, key = apply_rotary_pos_emb(query, key, cos, sin)

    if local_attention != (-1, -1):
        attention_mask = sliding_window_mask

    attn_output = (
        F.scaled_dot_product_attention(
            query,
            key,
            value,
            dropout_p=module.attention_dropout if module.training else 0.0,
            attn_mask=attention_mask,
        )
        .transpose(1, 2)
        .contiguous()
    )
    attn_output = attn_output.view(bs, -1, dim)
    return (attn_output,)


MODERNBERT_ATTENTION_FUNCTION = {
    "flash_attention_2": flash_attention_forward,
    "eager": eager_attention_forward,
    "sdpa": sdpa_attention_forward,
}


class ModernBertAttention(nn.Module):
    """Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    """

    def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_id = layer_id

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.attention_dropout = config.attention_dropout
        self.deterministic_flash_attn = config.deterministic_flash_attn
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.head_dim * self.num_heads
        self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attention_bias)

        if layer_id % config.global_attn_every_n_layers != 0:
            self.local_attention = (config.local_attention // 2, config.local_attention // 2)
        else:
            self.local_attention = (-1, -1)

        rope_theta = config.global_rope_theta
        max_position_embeddings = config.max_position_embeddings
        if self.local_attention != (-1, -1):
            if config.local_rope_theta is not None:
                rope_theta = config.local_rope_theta
            max_position_embeddings = config.local_attention

        if config._attn_implementation == "flash_attention_2":
            self.rotary_emb = ModernBertUnpaddedRotaryEmbedding(
                dim=self.head_dim, max_seqlen=max_position_embeddings, base=rope_theta
            )
        else:
            self.rotary_emb = ModernBertRotaryEmbedding(config=config, dim=self.head_dim, base=rope_theta)

        self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
        self.out_drop = nn.Dropout(config.attention_dropout) if config.attention_dropout > 0.0 else nn.Identity()
        self.pruned_heads = set()

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> torch.Tensor:
        qkv = self.Wqkv(hidden_states)

        bs = hidden_states.shape[0]
        if self.config._attn_implementation == "flash_attention_2":
            qkv = qkv.view(-1, 3, self.num_heads, self.head_dim)
        else:
            qkv = qkv.view(bs, -1, 3, self.num_heads, self.head_dim)

        attn_outputs = MODERNBERT_ATTENTION_FUNCTION[self.config._attn_implementation](
            self,
            qkv=qkv,
            rotary_emb=self.rotary_emb,
            local_attention=self.local_attention,
            bs=bs,
            dim=self.all_head_size,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = attn_outputs[0]
        hidden_states = self.out_drop(self.Wo(hidden_states))

        return (hidden_states,) + attn_outputs[1:]  # add attentions if outputted


class ModernBertEncoderLayer(nn.Module):
    def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None):
        super().__init__()
        self.config = config
        if layer_id == 0:
            self.attn_norm = nn.Identity()
        else:
            self.attn_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.attn = ModernBertAttention(config=config, layer_id=layer_id)
        self.mlp_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.mlp = ModernBertMLP(config)

    @torch.compile(dynamic=True)
    def compiled_mlp(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.mlp(self.mlp_norm(hidden_states))

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        output_attentions: Optional[bool] = False,
    ) -> torch.Tensor:
        attn_outputs = self.attn(
            self.attn_norm(hidden_states),
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_outputs[0]
        mlp_output = (
            self.compiled_mlp(hidden_states)
            if self.config.reference_compile
            else self.mlp(self.mlp_norm(hidden_states))
        )
        hidden_states = hidden_states + mlp_output

        return (hidden_states,) + attn_outputs[1:]  # add attentions if outputted


@auto_docstring
class ModernBertPreTrainedModel(PreTrainedModel):
    config_class = ModernBertConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ModernBertEmbeddings", "ModernBertEncoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = False

    def _init_weights(self, module: nn.Module):
        cutoff_factor = self.config.initializer_cutoff_factor
        if cutoff_factor is None:
            cutoff_factor = 3

        def init_weight(module: nn.Module, std: float):
            nn.init.trunc_normal_(
                module.weight,
                mean=0.0,
                std=std,
                a=-cutoff_factor * std,
                b=cutoff_factor * std,
            )

            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

        stds = {
            "in": self.config.initializer_range,
            "out": self.config.initializer_range / math.sqrt(2.0 * self.config.num_hidden_layers),
            "embedding": self.config.initializer_range,
            "final_out": self.config.hidden_size**-0.5,
        }

        if isinstance(module, ModernBertEmbeddings):
            init_weight(module.tok_embeddings, stds["embedding"])
        elif isinstance(module, ModernBertMLP):
            init_weight(module.Wi, stds["in"])
            init_weight(module.Wo, stds["out"])
        elif isinstance(module, ModernBertAttention):
            init_weight(module.Wqkv, stds["in"])
            init_weight(module.Wo, stds["out"])
        elif isinstance(module, ModernBertPredictionHead):
            init_weight(module.dense, stds["out"])
        elif isinstance(module, ModernBertForMaskedLM):
            init_weight(module.decoder, stds["out"])
        elif isinstance(
            module,
            (ModernBertForSequenceClassification, ModernBertForTokenClassification, ModernBertForQuestionAnswering),
        ):
            init_weight(module.classifier, stds["final_out"])
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            if module.bias is not None:
                module.bias.data.zero_()

    @classmethod
    def _autoset_attn_implementation(
        cls,
        config,
        use_flash_attention_2: bool = False,
        torch_dtype: Optional[torch.dtype] = None,
        device_map: Optional[Union[str, Dict[str, int]]] = None,
        check_device_map: bool = True,
    ):
        # If the user didn't specify anything, try to use flash_attention_2.
        # Otherwise fall back to the default SDPA -> eager resolution of the parent class.
        if config._attn_implementation_internal is None:
            config._attn_implementation_internal = "flash_attention_2"
            try:
                return cls._check_and_enable_flash_attn_2(
                    config,
                    torch_dtype=torch.float16,
                    device_map=device_map,
                    hard_check_only=False,
                    check_device_map=check_device_map,
                )
            except (ValueError, ImportError):
                config._attn_implementation_internal = None
        return super()._autoset_attn_implementation(
            config,
            use_flash_attention_2=use_flash_attention_2,
            torch_dtype=torch.float16,
            device_map=device_map,
            check_device_map=check_device_map,
        )

    def _maybe_set_compile(self):
        if self.config.reference_compile is False:
            return

        if hasattr(self, "hf_device_map") and len(self.hf_device_map) > 1:
            if self.config.reference_compile:
                logger.warning_once(
                    "If `accelerate` split the model across devices, `torch.compile` will not work. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.device.type == "mps":
            if self.config.reference_compile:
                logger.warning_once(
                    "Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.device.type == "cpu":
            if self.config.reference_compile:
                logger.warning_once(
                    "Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.config.reference_compile is None:
            self.config.reference_compile = is_triton_available()

    def resize_token_embeddings(self, *args, **kwargs):
        model_embeds = super().resize_token_embeddings(*args, **kwargs)

        if self.config.reference_compile in {True, None}:
            if self.config.reference_compile:
                logger.warning_once(
                    "Resizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        return model_embeds
def _unpad_modernbert_input(
    inputs: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max().item())
    cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

    if inputs.dim() == 2:
        unpadded_inputs = inputs.flatten()[indices]
    else:
        batch, seqlen, *rest = inputs.shape
        shape = batch * seqlen
        unpadded_inputs = inputs.view(shape, *rest)[indices]

    unpadded_position_ids = position_ids.flatten()[indices] if position_ids is not None else None
    unpadded_labels = labels.flatten()[indices] if labels is not None else None

    return unpadded_inputs, indices, cu_seqlens, max_seqlen_in_batch, unpadded_position_ids, unpadded_labels


def _pad_modernbert_output(
    inputs: torch.Tensor,
    indices: torch.Tensor,
    batch: int,
    seqlen: int,
) -> torch.Tensor:
    """
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    """
    if inputs.dim() == 1:
        output = torch.zeros(batch * seqlen, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen)
    else:
        _, *rest = inputs.shape
        output = torch.zeros(batch * seqlen, *rest, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen, *rest)

    return padded_inputs
@auto_docstring
class ModernBertModel(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = ModernBertEmbeddings(config)
        self.layers = nn.ModuleList(
            [ModernBertEncoderLayer(config, layer_id) for layer_id in range(config.num_hidden_layers)]
        )
        self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.tok_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.tok_embeddings = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        self._maybe_set_compile()

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)

        if batch_size is None and seq_len is None:
            if inputs_embeds is not None:
                batch_size, seq_len = inputs_embeds.shape[:2]
            else:
                batch_size, seq_len = input_ids.shape[:2]
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

        repad = False
        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                repad = True
                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, *_ = _unpad_modernbert_input(
                            inputs=input_ids, attention_mask=attention_mask
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, *_ = _unpad_modernbert_input(
                        inputs=inputs_embeds, attention_mask=attention_mask
                    )
        else:
            if position_ids is None:
                position_ids = torch.arange(seq_len, device=device).unsqueeze(0)

            attention_mask, sliding_window_mask = self._update_attention_mask(
                attention_mask, output_attentions=output_attentions
            )

        hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    sliding_window_mask,
                    position_ids,
                    cu_seqlens,
                    max_seqlen,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    sliding_window_mask=sliding_window_mask,
                    position_ids=position_ids,
                    cu_seqlens=cu_seqlens,
                    max_seqlen=max_seqlen,
                    output_attentions=output_attentions,
                )
            hidden_states = layer_outputs[0]
            if output_attentions and len(layer_outputs) > 1:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        hidden_states = self.final_norm(hidden_states)

        if repad:
            hidden_states = _pad_modernbert_output(
                inputs=hidden_states, indices=indices, batch=batch_size, seqlen=seq_len
            )
            if all_hidden_states is not None:
                all_hidden_states = tuple(
                    _pad_modernbert_output(inputs=hs, indices=indices, batch=batch_size, seqlen=seq_len)
                    for hs in all_hidden_states
                )

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_attention_mask(self, attention_mask: torch.Tensor, output_attentions: bool) -> torch.Tensor:
        if output_attentions:
            if self.config._attn_implementation == "sdpa":
                logger.warning_once(
                    "Outputting attentions is only supported with the 'eager' attention implementation, "
                    'not with "sdpa". Falling back to `attn_implementation="eager"`.'
                )
                self.config._attn_implementation = "eager"
            elif self.config._attn_implementation != "eager":
                logger.warning_once(
                    "Outputting attentions is only supported with the eager attention implementation, "
                    f"not with {self.config._attn_implementation}. Consider setting `attn_implementation=\"eager\"`."
                    " Setting `output_attentions=False`."
                )

        global_attention_mask = _prepare_4d_attention_mask(attention_mask, self.dtype)

        # Create position indices
        rows = torch.arange(global_attention_mask.shape[2]).unsqueeze(0)
        # Calculate distance between positions
        distance = torch.abs(rows - rows.T)

        # Create sliding window mask (1 for positions within window, 0 outside)
        window_mask = (
            (distance <= self.config.local_attention // 2).unsqueeze(0).unsqueeze(0).to(attention_mask.device)
        )
        # Combine with existing mask
        sliding_window_mask = global_attention_mask.masked_fill(
            window_mask.logical_not(), torch.finfo(self.dtype).min
        )

        return global_attention_mask, sliding_window_mask


class ModernBertPredictionHead(nn.Module):
    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias)
        self.act = ACT2FN[config.classifier_activation]
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.norm(self.act(self.dense(hidden_states)))
@auto_docstring(
    custom_intro="""
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    """
)
class ModernBertForMaskedLM(ModernBertPreTrainedModel):
    _tied_weights_keys = ["decoder.weight"]

    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.config = config
        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=config.decoder_bias)

        self.sparse_prediction = self.config.sparse_prediction
        self.sparse_pred_ignore_index = self.config.sparse_pred_ignore_index

        self.post_init()

    def get_output_embeddings(self):
        return self.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear):
        self.decoder = new_embeddings

    @torch.compile(dynamic=True)
    def compiled_head(self, output: torch.Tensor) -> torch.Tensor:
        return self.decoder(self.head(output))

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                if batch_size is None and seq_len is None:
                    if inputs_embeds is not None:
                        batch_size, seq_len = inputs_embeds.shape[:2]
                    else:
                        batch_size, seq_len = input_ids.shape[:2]
                device = input_ids.device if input_ids is not None else inputs_embeds.device

                if attention_mask is None:
                    attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
                            inputs=input_ids, attention_mask=attention_mask, position_ids=position_ids, labels=labels
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
                        inputs=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, labels=labels
                    )

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.sparse_prediction and labels is not None:
            # flatten labels and output first
            labels = labels.view(-1)
            last_hidden_state = last_hidden_state.view(labels.shape[0], -1)

            # then filter out the non-masked tokens
            mask_tokens = labels != self.sparse_pred_ignore_index
            last_hidden_state = last_hidden_state[mask_tokens]
            labels = labels[mask_tokens]

        logits = (
            self.compiled_head(last_hidden_state)
            if self.config.reference_compile
            else self.decoder(self.head(last_hidden_state))
        )

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size)

        if self.config._attn_implementation == "flash_attention_2":
            with nullcontext() if self.config.repad_logits_with_grad or labels is None else torch.no_grad():
                logits = _pad_modernbert_output(inputs=logits, indices=indices, batch=batch_size, seqlen=seq_len)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    The ModernBert Model with a sequence classification head on top that performs pooling.
    """
)
class ModernBertForSequenceClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.config.classifier_pooling == "cls":
            last_hidden_state = last_hidden_state[:, 0]
        elif self.config.classifier_pooling == "mean":
            last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(
                dim=1, keepdim=True
            )

        pooled_output = self.head(last_hidden_state)
        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    """
)
class ModernBertForTokenClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class ModernBertForQuestionAnswering(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "ModernBertModel",
    "ModernBertPreTrainedModel",
    "ModernBertForMaskedLM",
    "ModernBertForSequenceClassification",
    "ModernBertForTokenClassification",
    "ModernBertForQuestionAnswering",
]