
import math
from contextlib import nullcontext
from typing import Dict, Literal, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...configuration_utils import PretrainedConfig
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_flash_attn_2_available, logging
from ...utils.import_utils import is_triton_available
from ..gemma.modeling_gemma import GemmaRotaryEmbedding, apply_rotary_pos_emb


if is_flash_attn_2_available():
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
    from flash_attn.layers.rotary import RotaryEmbedding
    from flash_attn.ops.triton.rotary import apply_rotary
else:
    RotaryEmbedding = object

logger = logging.get_logger(__name__)


class ModernBertConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the ModernBERT-base.
e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 50368):
        Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`ModernBertModel`]
    hidden_size (`int`, *optional*, defaults to 768):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 1152):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 22):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder. Will default to `"gelu"`
        if not specified.
    max_position_embeddings (`int`, *optional*, defaults to 8192):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
        The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    norm_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the normalization layers.
    pad_token_id (`int`, *optional*, defaults to 50283):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 50282):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 50281):
        Beginning of stream token id.
    cls_token_id (`int`, *optional*, defaults to 50281):
        Classification token id.
    sep_token_id (`int`, *optional*, defaults to 50282):
        Separation token id.
    global_rope_theta (`float`, *optional*, defaults to 160000.0):
        The base period of the global RoPE embeddings.
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    global_attn_every_n_layers (`int`, *optional*, defaults to 3):
        The number of layers between global attention layers.
    local_attention (`int`, *optional*, defaults to 128):
        The window size for local attention.
    local_rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the local RoPE embeddings.
    embedding_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the embeddings.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the MLP layers.
    mlp_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the MLP layers.
    decoder_bias (`bool`, *optional*, defaults to `True`):
        Whether to use bias in the decoder layers.
    classifier_pooling (`str`, *optional*, defaults to `"cls"`):
        The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
        CLS token doesn't attend to all tokens on long sequences.
    classifier_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the classifier.
    classifier_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the classifier.
    classifier_activation (`str`, *optional*, defaults to `"gelu"`):
        The activation function for the classifier.
    deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
        Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
    sparse_prediction (`bool`, *optional*, defaults to `False`):
        Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
    sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
        The index to ignore for the sparse prediction.
    reference_compile (`bool`, *optional*):
        Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
        the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
        shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
        be faster in some scenarios.
    repad_logits_with_grad (`bool`, *optional*, defaults to `False`):
        When True, ModernBertForMaskedLM keeps track of the logits' gradient when repadding for output. This only
        applies when using Flash Attention 2 with passed labels. Otherwise output logits always have a gradient.

Examples:

```python
>>> from transformers import ModernBertModel, ModernBertConfig

>>> # Initializing a ModernBert style configuration
>>> configuration = ModernBertConfig()

>>> # Initializing a model from the modernbert-base style configuration
>>> model = ModernBertModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
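
With the defaults, every third layer uses global attention and the remaining layers use a
128-token local window. A quick check against the default configuration (a sketch; the layer
pattern follows `global_attn_every_n_layers`):

```python
>>> configuration.global_attn_every_n_layers, configuration.local_attention
(3, 128)

>>> # layers 0, 3, 6, ... attend globally; all other layers attend locally
>>> [i for i in range(configuration.num_hidden_layers) if i % configuration.global_attn_every_n_layers == 0][:4]
[0, 3, 6, 9]
```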
    """

    model_type = "modernbert"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50368,
        hidden_size=768,
        intermediate_size=1152,
        num_hidden_layers=22,
        num_attention_heads=12,
        hidden_activation="gelu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        initializer_cutoff_factor=2.0,
        norm_eps=1e-5,
        norm_bias=False,
        pad_token_id=50283,
        eos_token_id=50282,
        bos_token_id=50281,
        cls_token_id=50281,
        sep_token_id=50282,
        global_rope_theta=160000.0,
        attention_bias=False,
        attention_dropout=0.0,
        global_attn_every_n_layers=3,
        local_attention=128,
        local_rope_theta=10000.0,
        embedding_dropout=0.0,
        mlp_bias=False,
        mlp_dropout=0.0,
        decoder_bias=True,
        classifier_pooling: Literal["cls", "mean"] = "cls",
        classifier_dropout=0.0,
        classifier_bias=False,
        classifier_activation="gelu",
        deterministic_flash_attn=False,
        sparse_prediction=False,
        sparse_pred_ignore_index=-100,
        reference_compile=None,
        repad_logits_with_grad=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.initializer_cutoff_factor = initializer_cutoff_factor
        self.norm_eps = norm_eps
        self.norm_bias = norm_bias
        self.global_rope_theta = global_rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.local_attention = local_attention
        self.local_rope_theta = local_rope_theta
        self.embedding_dropout = embedding_dropout
        self.mlp_bias = mlp_bias
        self.mlp_dropout = mlp_dropout
        self.decoder_bias = decoder_bias
        self.classifier_pooling = classifier_pooling
        self.classifier_dropout = classifier_dropout
        self.classifier_bias = classifier_bias
        self.classifier_activation = classifier_activation
        self.deterministic_flash_attn = deterministic_flash_attn
        self.sparse_prediction = sparse_prediction
        self.sparse_pred_ignore_index = sparse_pred_ignore_index
        self.reference_compile = reference_compile
        self.repad_logits_with_grad = repad_logits_with_grad

        if self.classifier_pooling not in ["cls", "mean"]:
            raise ValueError(
                f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.'
            )

    def to_dict(self):
        output = super().to_dict()
        output.pop("reference_compile", None)
        return output


def _unpad_modernbert_input(
    inputs: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """
Remove padding from input sequences.

Args:
    inputs: (batch, seqlen, ...) or (batch, seqlen)
    attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    position_ids: (batch, seqlen), int, position ids
    labels: (batch, seqlen), int, labels

Returns:
    unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
    indices: (total_nnz)
    cu_seqlens: (batch + 1), the cumulative sequence lengths
    max_seqlen_in_batch: int
    unpadded_position_ids: (total_nnz) or None
    unpadded_labels: (total_nnz) or None
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max().item())
    cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

    if inputs.dim() == 2:
        unpadded_inputs = inputs.flatten()[indices]
    else:
        batch, seqlen, *rest = inputs.shape
        shape = batch * seqlen
        unpadded_inputs = inputs.view(shape, *rest)[indices]

    unpadded_position_ids = position_ids.flatten()[indices] if position_ids is not None else None
    unpadded_labels = labels.flatten()[indices] if labels is not None else None

    return unpadded_inputs, indices, cu_seqlens, max_seqlen_in_batch, unpadded_position_ids, unpadded_labels


def _pad_modernbert_output(
    inputs: torch.Tensor,
    indices: torch.Tensor,
    batch: int,
    seqlen: int,
) -> torch.Tensor:
    """
Add padding to sequences.

Args:
    inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
    indices: (total_nnz)
    batch: int, batch size
    seqlen: int, max sequence length

Returns:
    padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    """
    if inputs.dim() == 1:
        output = torch.zeros(batch * seqlen, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen)
    else:
        _, *rest = inputs.shape
        output = torch.zeros(batch * seqlen, *rest, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen, *rest)

    return padded_inputs


class ApplyRotaryEmbUnpad(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        qkv,
        cos,
        sin,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ):
        # (total_nnz, 3, nheads, headdim)
        qkv = qkv.contiguous()
        total_nnz, _three, _nheads, headdim = qkv.shape
        # We need qkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
        # we get the same tensor
        # qk = rearrange(qkv[:, :2], "b_s t h d -> b_s (t h) d")
        qk = qkv[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            qk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=False,
            inplace=True,
        )

        ctx.save_for_backward(cos, sin, cu_seqlens)
        ctx.max_seqlen = max_seqlen
        return qkv

    @staticmethod
    def backward(ctx, do):
        cos, sin, cu_seqlens = ctx.saved_tensors
        do = do.contiguous()
        total_nnz, _three, _nheads, headdim = do.shape
        # We need dqkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
        # we get the same tensor
        dqk = do[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            dqk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=False,
            inplace=True,
            conjugate=True,
        )

        return do, None, None, None, None


def apply_rotary_unpadded(
    qkv,
    cos,
    sin,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
Arguments:
    qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
    cos, sin: (seqlen_rotary, rotary_dim / 2)
    interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
        of 1st half and 2nd half (GPT-NeoX style).
    inplace: if True, apply rotary embedding in-place.
    seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
        Most commonly used in inference when we have KV cache.
    cu_seqlens: (batch + 1,) or None
    max_seqlen: int
Return:
    out: (total_nnz, dim)
rotary_dim must be <= headdim
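For example, with headdim=64 and rotary_dim=32, only the first 32 dimensions of each head are
rotated and the remaining dimensions pass through unchanged.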
Apply rotary embedding to the first rotary_dim of x.
    """
    return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)


class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
    """
    The rotary position embeddings applied directly to unpadded sequences.
    """

    def __init__(
        self,
        dim: int,
        base: float = 10000.0,
        max_seqlen: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        """
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        """
        super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
        self.max_seqlen = max_seqlen

        if max_seqlen is not None and device is not None and dtype is not None:
            self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        cu_seqlens: torch.Tensor,
        max_seqlen: Optional[int] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        """
        if max_seqlen is not None:
            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)

        qkv = apply_rotary_unpadded(
            qkv,
            self._cos_cached,
            self._sin_cached,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        return qkv

    def extra_repr(self) -> str:
        return f"dim={self.dim}, base={self.base}, scale_base={self.scale_base}"
class ModernBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.drop = nn.Dropout(config.embedding_dropout)

    @torch.compile(dynamic=True)
    def compiled_embeddings(self, input_ids: torch.LongTensor) -> torch.Tensor:
        return self.drop(self.norm(self.tok_embeddings(input_ids)))

    def forward(
        self, input_ids: torch.LongTensor = None, inputs_embeds: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        if inputs_embeds is not None:
            hidden_states = self.drop(self.norm(inputs_embeds))
        else:
            hidden_states = (
                self.compiled_embeddings(input_ids)
                if self.config.reference_compile
                else self.drop(self.norm(self.tok_embeddings(input_ids)))
            )
        return hidden_states


class ModernBertMLP(nn.Module):
    """Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.Wi = nn.Linear(config.hidden_size, int(config.intermediate_size) * 2, bias=config.mlp_bias)
        self.act = ACT2FN[config.hidden_activation]
        self.drop = nn.Dropout(config.mlp_dropout)
        self.Wo = nn.Linear(config.intermediate_size, config.hidden_size, bias=config.mlp_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input, gate = self.Wi(hidden_states).chunk(2, dim=-1)
        return self.Wo(self.drop(self.act(input) * gate))


class ModernBertRotaryEmbedding(GemmaRotaryEmbedding):
    def __init__(self, config: ModernBertConfig, dim: int, base: float, device: Optional[torch.device] = None):
        super().__init__(config=config, device=device)
        inv_freq, self.attention_scaling = self.rope_init_fn(None, device, dim=dim, base=base)


def eager_attention_forward(
    module: "ModernBertAttention",
    qkv: torch.Tensor,
    attention_mask: torch.Tensor,
    sliding_window_mask: torch.Tensor,
    position_ids: Optional[torch.LongTensor],
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    output_attentions: Optional[bool] = False,
    **_kwargs,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
    # qkv: [batch_size, seqlen, 3, nheads, headdim]
    cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
    query, key, value = qkv.transpose(3, 1).unbind(dim=2)
    # query, key, value: [batch_size, heads, seq_len, head_dim]
    query, key = apply_rotary_pos_emb(query, key, cos, sin)

    scale = module.head_dim**-0.5
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scale

    if local_attention != (-1, -1):
        attention_mask = sliding_window_mask

    attn_weights = attn_weights + attention_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=module.attention_dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.view(bs, -1, dim)
    if output_attentions:
        return (attn_output, attn_weights)
    return (attn_output,)


def flash_attention_forward(
    module: "ModernBertAttention",
    qkv: torch.Tensor,
    rotary_emb: ModernBertUnpaddedRotaryEmbedding,
    cu_seqlens: torch.Tensor,
    max_seqlen: int,
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    target_dtype: torch.dtype = torch.bfloat16,
    **_kwargs,
) -> Tuple[torch.Tensor]:
    # (total_seqlen, 3, nheads, headdim)
    qkv = rotary_emb(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)

    convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
    if convert_dtype:
        # FA2 implementation only supports fp16 and bf16. If FA2 is supported,
        # bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
        orig_dtype = qkv.dtype
        qkv = qkv.to(target_dtype)

        attn = flash_attn_varlen_qkvpacked_func(
            qkv,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            dropout_p=module.attention_dropout if module.training else 0.0,
            deterministic=module.deterministic_flash_attn,
            window_size=local_attention,
        )
        attn = attn.to(orig_dtype)  # type: ignore
    else:
        attn = flash_attn_varlen_qkvpacked_func(
            qkv,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            dropout_p=module.attention_dropout if module.training else 0.0,
            deterministic=module.deterministic_flash_attn,
            window_size=local_attention,
        )
    return (attn.view(bs, dim),)


def sdpa_attention_forward(
    module: "ModernBertAttention",
    qkv: torch.Tensor,
    attention_mask: torch.Tensor,
    sliding_window_mask: torch.Tensor,
    position_ids: Optional[torch.LongTensor],
    local_attention: Tuple[int, int],
    bs: int,
    dim: int,
    **_kwargs,
) -> Tuple[torch.Tensor]:
    # qkv: [batch_size, seqlen, 3, nheads, headdim]
    cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
    query, key, value = qkv.transpose(3, 1).unbind(dim=2)
    # query, key, value: [batch_size, heads, seq_len, head_dim]
    query, key = apply_rotary_pos_emb(query, key, cos, sin)

    if local_attention != (-1, -1):
        attention_mask = sliding_window_mask

    attn_output = (
        F.scaled_dot_product_attention(
            query,
            key,
            value,
            dropout_p=module.attention_dropout if module.training else 0.0,
            attn_mask=attention_mask,
        )
        .transpose(1, 2)
        .contiguous()
    )
    attn_output = attn_output.view(bs, -1, dim)
    return (attn_output,)


MODERNBERT_ATTENTION_FUNCTION = {
    "flash_attention_2": flash_attention_forward,
    "eager": eager_attention_forward,
    "sdpa": sdpa_attention_forward,
}


class ModernBertAttention(nn.Module):
    """Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    """

    def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_id = layer_id

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads ({config.num_attention_heads})"
            )

        self.attention_dropout = config.attention_dropout
        self.deterministic_flash_attn = config.deterministic_flash_attn
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.head_dim * self.num_heads
        self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attention_bias)

        if layer_id % config.global_attn_every_n_layers != 0:
            self.local_attention = (config.local_attention // 2, config.local_attention // 2)
        else:
            self.local_attention = (-1, -1)

        rope_theta = config.global_rope_theta
        max_position_embeddings = config.max_position_embeddings
        if self.local_attention != (-1, -1):
            if config.local_rope_theta is not None:
                rope_theta = config.local_rope_theta
            max_position_embeddings = config.local_attention

        if config._attn_implementation == "flash_attention_2":
            self.rotary_emb = ModernBertUnpaddedRotaryEmbedding(
                dim=self.head_dim, max_seqlen=max_position_embeddings, base=rope_theta
            )
        else:
            self.rotary_emb = ModernBertRotaryEmbedding(config=config, dim=self.head_dim, base=rope_theta)

        self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
        self.out_drop = nn.Dropout(config.attention_dropout) if config.attention_dropout > 0.0 else nn.Identity()
        self.pruned_heads = set()

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> torch.Tensor:
        qkv = self.Wqkv(hidden_states)

        bs = hidden_states.shape[0]
        if self.config._attn_implementation == "flash_attention_2":
            qkv = qkv.view(-1, 3, self.num_heads, self.head_dim)
        else:
            qkv = qkv.view(bs, -1, 3, self.num_heads, self.head_dim)

        attn_outputs = MODERNBERT_ATTENTION_FUNCTION[self.config._attn_implementation](
            self,
            qkv=qkv,
            rotary_emb=self.rotary_emb,
            local_attention=self.local_attention,
            bs=bs,
            dim=self.all_head_size,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = attn_outputs[0]
        hidden_states = self.out_drop(self.Wo(hidden_states))

        return (hidden_states,) + attn_outputs[1:]


class ModernBertEncoderLayer(nn.Module):
    def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None):
        super().__init__()
        self.config = config
        if layer_id == 0:
            self.attn_norm = nn.Identity()
        else:
            self.attn_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.attn = ModernBertAttention(config=config, layer_id=layer_id)
        self.mlp_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.mlp = ModernBertMLP(config)

    @torch.compile(dynamic=True)
    def compiled_mlp(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.mlp(self.mlp_norm(hidden_states))

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        output_attentions: Optional[bool] = False,
    ) -> torch.Tensor:
        attn_outputs = self.attn(
            self.attn_norm(hidden_states),
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_outputs[0]
        mlp_output = (
            self.compiled_mlp(hidden_states)
            if self.config.reference_compile
            else self.mlp(self.mlp_norm(hidden_states))
        )
        hidden_states = hidden_states + mlp_output

        return (hidden_states,) + attn_outputs[1:]


@auto_docstring
class ModernBertPreTrainedModel(PreTrainedModel):
    config_class = ModernBertConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ModernBertEmbeddings", "ModernBertEncoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = False

    def _init_weights(self, module: nn.Module):
        cutoff_factor = self.config.initializer_cutoff_factor
        if cutoff_factor is None:
            cutoff_factor = 3

        def init_weight(module: nn.Module, std: float):
            nn.init.trunc_normal_(
                module.weight,
                mean=0.0,
                std=std,
                a=-cutoff_factor * std,
                b=cutoff_factor * std,
            )
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

        stds = {
            "in": self.config.initializer_range,
            "out": self.config.initializer_range / math.sqrt(2.0 * self.config.num_hidden_layers),
            "embedding": self.config.initializer_range,
            "final_out": self.config.hidden_size**-0.5,
        }

        if isinstance(module, ModernBertEmbeddings):
            init_weight(module.tok_embeddings, stds["embedding"])
        elif isinstance(module, ModernBertMLP):
            init_weight(module.Wi, stds["in"])
            init_weight(module.Wo, stds["out"])
        elif isinstance(module, ModernBertAttention):
            init_weight(module.Wqkv, stds["in"])
            init_weight(module.Wo, stds["out"])
        elif isinstance(module, ModernBertPredictionHead):
            init_weight(module.dense, stds["out"])
        elif isinstance(module, ModernBertForMaskedLM):
            init_weight(module.decoder, stds["out"])
        elif isinstance(
            module, (ModernBertForSequenceClassification, ModernBertForTokenClassification, ModernBertForQuestionAnswering)
        ):
            init_weight(module.classifier, stds["final_out"])
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            if module.bias is not None:
                module.bias.data.zero_()

    @classmethod
    def _autoset_attn_implementation(
        cls,
        config,
        use_flash_attention_2: bool = False,
        torch_dtype: Optional[torch.dtype] = None,
        device_map: Optional[Union[str, Dict[str, int]]] = None,
        check_device_map: bool = True,
    ):
        # If the user didn't specify anything, try to use flash_attention_2 if available.
        # Otherwise we fall back to the default SDPA -> Eager from the super() method.
        if config._attn_implementation_internal is None:
            config._attn_implementation_internal = "flash_attention_2"
            try:
                return cls._check_and_enable_flash_attn_2(
                    config,
                    torch_dtype=torch.float16,
                    device_map=device_map,
                    hard_check_only=False,
                    check_device_map=check_device_map,
                )
            except (ValueError, ImportError):
                config._attn_implementation_internal = None
        return super()._autoset_attn_implementation(
            config,
            use_flash_attention_2=use_flash_attention_2,
            torch_dtype=torch.float16,
            device_map=device_map,
            check_device_map=check_device_map,
        )

    def _maybe_set_compile(self):
        if self.config.reference_compile is False:
            return

        if hasattr(self, "hf_device_map") and len(self.hf_device_map) > 1:
            if self.config.reference_compile:
                logger.warning_once(
                    "If `accelerate` split the model across devices, `torch.compile` will not work. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.device.type == "mps":
            if self.config.reference_compile:
                logger.warning_once(
                    "Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.device.type == "cpu":
            if self.config.reference_compile:
                logger.warning_once(
                    "Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.config.reference_compile is None:
            self.config.reference_compile = is_triton_available()

    def resize_token_embeddings(self, *args, **kwargs):
        model_embeds = super().resize_token_embeddings(*args, **kwargs)

        if self.config.reference_compile in {True, None}:
            if self.config.reference_compile:
                logger.warning_once(
                    "Resizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        return model_embeds


@auto_docstring
class ModernBertModel(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = ModernBertEmbeddings(config)
        self.layers = nn.ModuleList(
            [ModernBertEncoderLayer(config, layer_id) for layer_id in range(config.num_hidden_layers)]
        )
        self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.tok_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.tok_embeddings = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutput]:
        r"""
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        self._maybe_set_compile()

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)

        if batch_size is None and seq_len is None:
            if inputs_embeds is not None:
                batch_size, seq_len = inputs_embeds.shape[:2]
            else:
                batch_size, seq_len = input_ids.shape[:2]
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

        repad = False
        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                repad = True
                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, *_ = _unpad_modernbert_input(
                            inputs=input_ids, attention_mask=attention_mask
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, *_ = _unpad_modernbert_input(
                        inputs=inputs_embeds, attention_mask=attention_mask
                    )
        else:
            if position_ids is None:
                position_ids = torch.arange(seq_len, device=device).unsqueeze(0)

            attention_mask, sliding_window_mask = self._update_attention_mask(
                attention_mask, output_attentions=output_attentions
            )

        hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    sliding_window_mask,
                    position_ids,
                    cu_seqlens,
                    max_seqlen,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    sliding_window_mask=sliding_window_mask,
                    position_ids=position_ids,
                    cu_seqlens=cu_seqlens,
                    max_seqlen=max_seqlen,
                    output_attentions=output_attentions,
                )
            hidden_states = layer_outputs[0]
            if output_attentions and len(layer_outputs) > 1:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        hidden_states = self.final_norm(hidden_states)

        if repad:
            hidden_states = _pad_modernbert_output(
                inputs=hidden_states, indices=indices, batch=batch_size, seqlen=seq_len
            )
            if all_hidden_states is not None:
                all_hidden_states = tuple(
                    _pad_modernbert_output(inputs=hs, indices=indices, batch=batch_size, seqlen=seq_len)
                    for hs in all_hidden_states
                )

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_attention_mask(self, attention_mask: torch.Tensor, output_attentions: bool) -> torch.Tensor:
        if output_attentions:
            if self.config._attn_implementation == "sdpa":
                logger.warning_once(
                    "Outputting attentions is only supported with the 'eager' attention implementation, "
                    'not with "sdpa". Falling back to `attn_implementation="eager"`.'
                )
                self.config._attn_implementation = "eager"
            elif self.config._attn_implementation != "eager":
                logger.warning_once(
                    "Outputting attentions is only supported with the eager attention implementation, "
                    f'not with {self.config._attn_implementation}. Consider setting `attn_implementation="eager"`.'
                    " Setting `output_attentions=False`."
                )

        global_attention_mask = _prepare_4d_attention_mask(attention_mask, self.dtype)

        # Create position indices
        rows = torch.arange(global_attention_mask.shape[2]).unsqueeze(0)
        # Calculate distance between positions
        distance = torch.abs(rows - rows.T)

        # Create sliding window mask (1 = within window, 0 = outside window)
        window_mask = (
            (distance <= self.config.local_attention // 2).unsqueeze(0).unsqueeze(0).to(attention_mask.device)
        )
        # Combine with existing mask
        sliding_window_mask = global_attention_mask.masked_fill(
            window_mask.logical_not(), torch.finfo(self.dtype).min
        )

        return global_attention_mask, sliding_window_mask


class ModernBertPredictionHead(nn.Module):
    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias)
        self.act = ACT2FN[config.classifier_activation]
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.norm(self.act(self.dense(hidden_states)))


@auto_docstring(
    custom_intro="""
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    """
)
class ModernBertForMaskedLM(ModernBertPreTrainedModel):
    _tied_weights_keys = ["decoder.weight"]

    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.config = config
        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=config.decoder_bias)

        self.sparse_prediction = self.config.sparse_prediction
        self.sparse_pred_ignore_index = self.config.sparse_pred_ignore_index

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear):
        self.decoder = new_embeddings

    @torch.compile(dynamic=True)
    def compiled_head(self, output: torch.Tensor) -> torch.Tensor:
        return self.decoder(self.head(output))

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]`. Tokens with indices set to `-100` are ignored (masked), the loss is only computed
            for the tokens with labels in `[0, ..., config.vocab_size]`.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                if batch_size is None and seq_len is None:
                    if inputs_embeds is not None:
                        batch_size, seq_len = inputs_embeds.shape[:2]
                    else:
                        batch_size, seq_len = input_ids.shape[:2]
                device = input_ids.device if input_ids is not None else inputs_embeds.device

                if attention_mask is None:
                    attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
                            inputs=input_ids, attention_mask=attention_mask, position_ids=position_ids, labels=labels
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
                        inputs=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, labels=labels
                    )

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.sparse_prediction and labels is not None:
            # flatten labels and output first
            labels = labels.view(-1)
            last_hidden_state = last_hidden_state.view(labels.shape[0], -1)

            # then filter out the non-masked tokens
            mask_tokens = labels != self.sparse_pred_ignore_index
            last_hidden_state = last_hidden_state[mask_tokens]
            labels = labels[mask_tokens]

        logits = (
            self.compiled_head(last_hidden_state)
            if self.config.reference_compile
            else self.decoder(self.head(last_hidden_state))
        )

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size)

        if self.config._attn_implementation == "flash_attention_2":
            with nullcontext() if self.config.repad_logits_with_grad or labels is None else torch.no_grad():
                logits = _pad_modernbert_output(inputs=logits, indices=indices, batch=batch_size, seqlen=seq_len)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The ModernBert Model with a sequence classification head on top that performs pooling.
    """
)
class ModernBertForSequenceClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids, attention_mask=attention_mask, sliding_window_mask=sliding_window_mask,
            position_ids=position_ids, inputs_embeds=inputs_embeds, indices=indices, cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen, batch_size=batch_size, seq_len=seq_len, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.config.classifier_pooling == "cls":
            last_hidden_state = last_hidden_state[:, 0]
        elif self.config.classifier_pooling == "mean":
            last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(
                dim=1, keepdim=True
            )

        pooled_output = self.head(last_hidden_state)
        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring(
    custom_intro="""
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    """
)
class ModernBertForTokenClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
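        Example:

        ```python
        >>> # Minimal usage sketch (added for illustration, not from the original file):
        >>> # the checkpoint name and the label count are assumptions.
        >>> from transformers import AutoTokenizer, ModernBertForTokenClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
        >>> model = ModernBertForTokenClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=9)

        >>> inputs = tokenizer("Paris is the capital of France.", return_tensors="pt")
        >>> logits = model(**inputs).logits
        >>> # One logit vector per token: (batch_size, sequence_length, num_labels).
        >>> logits.argmax(dim=-1).shape == inputs["input_ids"].shape
        True
        ```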
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids, attention_mask=attention_mask, sliding_window_mask=sliding_window_mask,
            position_ids=position_ids, inputs_embeds=inputs_embeds, indices=indices, cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen, batch_size=batch_size, seq_len=seq_len, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class ModernBertForQuestionAnswering(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids, attention_mask=attention_mask, sliding_window_mask=sliding_window_mask,
            position_ids=position_ids, indices=indices, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen,
            batch_size=batch_size, seq_len=seq_len, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
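
# Minimal usage sketch for the question-answering head above (added for
# illustration, not part of the original module): the checkpoint name and the
# question/context strings are assumptions. Wrapped in a private function so
# nothing runs at import time.
def _example_question_answering():
    from transformers import AutoTokenizer, ModernBertForQuestionAnswering

    tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
    model = ModernBertForQuestionAnswering.from_pretrained("answerdotai/ModernBERT-base")

    # Question/context pairs are encoded as a single sequence; the head predicts
    # one start logit and one end logit per token.
    inputs = tokenizer(
        "What kind of model is ModernBERT?",
        "ModernBERT is a long-context encoder-only transformer.",
        return_tensors="pt",
    )
    outputs = model(**inputs)

    # Greedy span decoding: take the argmax of the start and end logits.
    start = int(outputs.start_logits.argmax(dim=-1))
    end = int(outputs.end_logits.argmax(dim=-1))
    return tokenizer.decode(inputs["input_ids"][0, start : end + 1])
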
rW   rr  )r"   r  rV  rn  rp  rq  rr  r   rC  )Trj  
contextlibr   typingr   r   r   r   r   r}   torch.nn.functionalr	   r   r+  torch.utils.checkpointtorch.nnr
   r   r   activationsr   configuration_utilsr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   utils.import_utilsr   gemma.modeling_gemmar   r   flash_attn.flash_attn_interfacer   flash_attn.layers.rotaryr   flash_attn.ops.triton.rotaryr    object
get_loggerrf   r  r"   r   r   r   r   autogradFunctionr   r   r   ri  r   r   r   r   rD  r  r$  ry   r(  r-  r@  r  rF  rV  r  rl  rn  rp  rq  rr  __all__r1   rW   rU   <module>r1     s     " 8 8      A A ! 3 B  . G G 5 M P89O 
		H	%A' AN ,0%)	&mLL&mLL&m 5<<(&m U\\"	&m
 5<<u||S(5<<:PRZ[`[g[gRhhi&mRLL\\  	
 \\>46%..11 46v *. $L &	L
 L42Q 2Qj299 <:BII :(_ 4 _ )."!"	" LL" 	"
 5++," 38_" 	" 
"  ~" 5u||+,eELL.AAB"\ !&(!!(!	(! 2(! 	(!
 (! 38_(! 	(! 
(! ++(! 5<<(!V ! 	  LL  	 
 5++,  38_  	  
  5<< H 1$"! M3")) M3`+3RYY +3\ B B BJ v:/ v: v:r	>ryy 	> 
H
5 H

H
V 
t
*C t

t
n 
W
'@ W

W
t X
%> X
 X
vrW   