
    fTh                        S SK r S SKJrJrJr  S SKrS SKJr  SSKJr  SSK	J
r
JrJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJrJrJrJrJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-  \*" 5       (       a  S SK.J/r/  SSK0J1r1  \+Rd                  " \35      r4 " S S\Rj                  5      r6S r7S=S jr8S\Rr                  S\:S\Rr                  4S jr;S r< " S S\Rj                  5      r= " S  S!\=5      r> " S" S#\=5      r?\" S$5       " S% S&\Rj                  5      5       r@\=\>\?S'.rA " S( S)\5      rB\( " S* S+\#5      5       rC " S, S-\Rj                  5      rD\( " S. S/\C5      5       rE " S0 S1\\'5      rF\( " S2 S3\C\5      5       rG\(" S4S59 " S6 S7\C5      5       rH\( " S8 S9\C5      5       rI\( " S: S;\C5      5       rJ/ S<QrKg)>    N)OptionalTupleUnion)nn   )ACT2FN)CacheDynamicCacheStaticCache)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)FlashAttentionKwargs_flash_attention_forward!flash_attn_supports_top_left_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )DiffLlamaConfig)	BlockMask)make_flex_block_causal_maskc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DiffLlamaMLP@   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFbias)super__init__confighidden_sizeintermediate_sizer   Linear	gate_projup_proj	down_projr   
hidden_actact_fnselfr.   	__class__s     h/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/diffllama/modeling_diffllama.pyr-   DiffLlamaMLP.__init__A   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../    c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ N)r4   r6   r2   r3   )r8   xr4   s      r:   forwardDiffLlamaMLP.forwardK   s6    NN4;;t~~a/@#ADLLQRO#ST	r<   )r6   r.   r4   r2   r/   r0   r3   )__name__
__module____qualname____firstlineno__r-   r@   __static_attributes____classcell__r9   s   @r:   r&   r&   @   s    0 r<   r&   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)r?   x1x2s      r:   rotate_halfrS   P   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r<   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezerS   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r:   apply_rotary_pos_embr^   W   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr<   hidden_statesn_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r!   N)rN   expandreshape)r_   r`   batchnum_key_value_headsslenhead_dims         r:   	repeat_kvri   r   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr<   c                 @    SS[         R                  " SU -  5      -  -
  $ )Ng?g333333?g333333ӿ)mathexp)	layer_idxs    r:   lambda_init_fnrn   ~   s     txxy 01111r<   c                   t  ^  \ rS rSrSrSS\S\\   4U 4S jjjr      SS\	R                  S\\	R                  \	R                  4   S\\	R                     S	\\	R                     S
\\   S\S\S\\	R                     S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )DiffLlamaAttention   z=Multi-headed attention from 'Attention Is All You Need' paperr.   rm   c                   > [         TU ]  5         Xl        X l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                  U l	        UR                  U l        [        USU R                  U R                  -  5      U l        UR                  U l        U R                  U R                  -  U l        UR                   U l        UR"                  U l        SU l        [&        R(                  " U R                  U R                  U R                  -  UR*                  S9U l        [&        R(                  " U R                  U R                  U R                  -  UR*                  S9U l        [&        R(                  " U R                  U R                  U R                  -  UR*                  S9U l        [&        R(                  " U R                  U R                  -  U R                  UR*                  S9U l        [5        U5      U l        [&        R8                  " [:        R<                  " SUR>                  U R                  4S95      U l         [&        R8                  " [:        R<                  " SUR>                  U R                  4S95      U l!        [&        R8                  " [:        R<                  " SUR>                  U R                  4S95      U l"        [&        R8                  " [:        R<                  " SUR>                  U R                  4S95      U l#        [&        RH                  " SU R                  -  URJ                  S	S
9U l&        g )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.rh   Tr*   r   )sizerK   F)epselementwise_affine)'r,   r-   r.   rm   loggerwarning_oncer9   rB   attention_dropoutr/   num_attention_heads	num_headsgetattrrh   rf   num_key_value_groupsmax_position_embeddings
rope_theta	is_causalr   r1   attention_biasq_projk_projv_projo_projrn   lambda_init	ParameterrO   normallambda_std_dev	lambda_q1	lambda_k1	lambda_q2	lambda_k2RMSNormrms_norm_eps	groupnormr8   r.   rm   r9   s      r:   r-   DiffLlamaAttention.__init__   sz   " !8!8 9 :, , "(!9!9!--33
D4D4D4VW#)#=#= $(NNd6N6N$N!'-'E'E$ ++ii 0 0$..4==2PW]WlWlmii 0 0$2J2JT]]2Zagavavwii 0 0$2J2JT]]2Zagavavwii >@P@PW]WlWlm))4ell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdA$56;N;Nchir<   r_   position_embeddingsattention_maskrZ   past_key_valueoutput_attentions	use_cachecache_positionra   c	                 
   UR                  5       u  pnUnU R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUu  nn[        XUU5      u  pUb'  UUUS.nUR                  UUU R                  U5      u  nn[        XR                  5      n[        UU R                  5      n[        R                  " [        R                   " USSS9SS9nUR#                  SSSS5      n[        R$                  " XR                  SS5      5      [&        R(                  " U R                  5      -  nUb#  US S 2S S 2S S 2S UR*                  S   24   nUU-   n[,        R.                  R1                  US[        R2                  S9R5                  UR6                  5      n[,        R.                  R9                  UU R:                  U R<                  S	9n[        R>                  " [        R@                  " U RB                  U RD                  -  S[        R2                  S95      R5                  UR6                  5      n[        R>                  " [        R@                  " U RF                  U RH                  -  S[        R2                  S95      R5                  UR6                  5      nUU-
  U RJ                  -   n[        R$                  " UU5      n[        R                   " USSS9u  nnUUU-  -
  nSU RJ                  -
  U RM                  U5      -  nUR                  SS5      RO                  5       nURQ                  XS5      nU RS                  U5      nU(       d  S nUU4$ )
Nr!   rK   rY   rX   r   rL   rJ   r   rM   dtype)ptraining)*rs   r   r   r   viewrz   rh   	transposerf   r^   updaterm   ri   r|   rO   rP   chunkrepeatmatmulrk   sqrtrN   r   
functionalsoftmaxfloat32tor   dropoutrx   r   rl   sumr   r   r   r   r   r   
contiguousrd   r   )r8   r_   r   r   rZ   r   r   r   r   kwargsbsz
target_len_q_lenquery_states
key_statesvalue_statesrX   rY   cache_kwargsattn_weightscausal_masklambda_1lambda_2lambda_fullattn_outputattn_output1attn_output2s                               r:   r@   DiffLlamaAttention.forward   sr    +//1{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&S#7RUWZ#[ %#&snUL'5'<'<ZW[WeWegs't$Jz+D+DE
 t/H/HIyy\1!!D"M#**1aA6||L2F2Fq!2LMPTPYPYZ^ZgZgPhh%(Aq2HJ4D4DR4H2H)HIK'+5L }},,\r,WZZ[g[m[mn}},,\T=S=S^b^k^k,l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<ll<>%*[[aQ%G"l"[<%??4+++t~~k/JJ!++Aq1<<>!))#b9kk+. LL((r<   )rx   r.   r   rh   r/   r   r   r   r   r   r   r   rm   r}   rz   r|   rf   r   r   r~   r   r>   NNNFFN)rB   rC   rD   rE   __doc__r"   r   intr-   rO   Tensorr   
LongTensorr	   boolr@   rF   rG   rH   s   @r:   rp   rp      s   G j  j8C=  j  jL 2637*."'59B)||B) #5<<#=>B) !.	B)
 u//0B) !B)  B) B) !!1!12B) 
u||Xell3XeELL>Q5RR	SB) B)r<   rp   c                   ^  ^  \ rS rSrSrU 4S jr      SS\R                  S\\R                  \R                  4   S\	\R                     S\	\R                     S\	\   S	\S
\S\	\R                     S\\R                  \	\R                     \	\\R                        4   4S jjrSrU =r$ )DiffLlamaFlashAttention2   a>  
DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
c                 D   > [         TU ]  " U0 UD6  [        5       U l        g r>   )r,   r-   r   _flash_attn_uses_top_left_mask)r8   argsr   r9   s      r:   r-   !DiffLlamaFlashAttention2.__init__   s#    $)&)
 /P.Q+r<   r_   r   r   rZ   r   r   r   r   ra   c	                 	   [        U[        5      (       a  [        S5      eSnUR                  5       u  pnU R	                  U5      nU R                  U5      nU R                  U5      nUR                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nUc*  [        R                  S5        U R                  X5      u  nnOUu  nn[        XUU5      u  pUb$  UXS.nUR!                  XU R"                  U5      u  pUR                  SS5      nUR                  SS5      nUR                  SS5      nU R$                  (       a  U R&                  OSnUR(                  nU[*        R,                  :X  a  [*        R.                  " 5       (       a  [*        R0                  " 5       nOR[3        U R4                  S5      (       a  U R4                  R6                  nO U R                  R8                  R(                  n[        R                  S	U S
35        UR;                  U5      nUR;                  U5      nUR;                  U5      n[*        R<                  " USSS9u  nnUR?                  SSSS5      nUR?                  SSSS5      n[A        UUUUU
UU[C        U SS 5      U RD                  U RF                  S9
n[A        UUUUU
UU[C        U SS 5      U RD                  U RF                  S9
n[*        RH                  " UU/SS9n[*        R<                  " USSS9u  nn[*        RJ                  " [*        RL                  " U RN                  U RP                  -  S[*        R,                  S95      R;                  UR(                  5      n[*        RJ                  " [*        RL                  " U RR                  U RT                  -  S[*        R,                  S95      R;                  UR(                  5      nUU-
  U RV                  -   nUUU-  -
  nSU RV                  -
  U RY                  U5      -  nUR[                  XS5      R]                  5       nU R_                  U5      nU(       d  S nUW4$ )Nz`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformersFr!   rK   aY  The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.r           _pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .rL   sliding_window)rZ   r   r   use_top_left_maskr   rJ   r   )0
isinstancer   
ValueErrorrs   r   r   r   r   rz   rh   r   rf   rv   rw   
rotary_embr^   r   rm   r   rx   r   rO   r   is_autocast_enabledget_autocast_gpu_dtypehasattrr.   r   weightr   r   r   r   r{   r   r   rP   rl   r   r   r   r   r   r   r   rd   r   r   )r8   r_   r   r   rZ   r   r   r   r   r   r   r   r   r   r   rX   rY   r   dropout_rateinput_dtypetarget_dtypevalue_states1value_states2r   r   r   r   r   r   r   s                                 r:   r@    DiffLlamaFlashAttention2.forward   s[    nk22} 
 "%**,A{{=1[[/
{{=1
 $((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&G |BHC*HC#7RUWZ#[ %#&sUL'5'<'<ZW[WeWegs't$J $--a3))!Q/
#--a315t--C #((%--'((**$;;=&?@@#{{BB#{{1177 >$ (??<8L#|4J'??<8L',{{<'J$}%,,Q1a8%,,Q1a8/% "4)94@"AAnn
 0% "4)94@"AAnn
 ii| <"E%*[[aQ%G"l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<"[<%??4+++t~~k/JJ!))#b9DDFkk+. LL((r<   )r   r   )rB   rC   rD   rE   r   r-   rO   r   r   r   r   r	   r   r@   rF   rG   rH   s   @r:   r   r      s    R 6:37*."'59D)||D) #5<<#=>D) !!1!12	D)
 u//0D) !D)  D) D) !!1!12D) 
u||Xell3XeELL>Q5RR	SD) D)r<   r   c                   X  ^  \ rS rSrSr      SS\R                  S\\R                  \R                  4   S\\R                     S\\R                     S\\
   S\S	\S
\\R                     S\\R                  \\R                     \\\R                        4   4U 4S jjjrSrU =r$ )DiffLlamaSdpaAttentioni  z
DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
SDPA API.
r_   r   r   rZ   r   r   r   r   ra   c	                   > U(       a)  [         R                  S5        [        TU ]  UUUUUUUUS9$ UR	                  5       u  pnU R                  U5      nU R                  U5      nU R                  U5      nUR                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nUu  nn[        XUU5      u  pUb%  UUUS.nUR                  XU R                  U5      u  p[!        XR"                  5      n[!        XR"                  5      n[$        R&                  " [$        R(                  " USSS9SS9nUR+                  SSSS5      nUnUb  US S 2S S 2S S 2S UR,                  S   24   nUR.                  R0                  S	:X  a3  Ub0  UR3                  5       nUR3                  5       nUR3                  5       nUc  US:  a  S
OSn[$        R4                  R6                  R9                  UUUUU R:                  (       a  U R<                  OSUS9n[$        R(                  " USSS9u  nn[$        R>                  " [$        R@                  " U RB                  U RD                  -  S[$        RF                  S95      RI                  URJ                  5      n[$        R>                  " [$        R@                  " U RL                  U RN                  -  S[$        RF                  S95      RI                  URJ                  5      nUU-
  U RP                  -   nUUU-  -
  nSU RP                  -
  U RS                  U5      -  nUR                  SS5      R3                  5       nUR                  XS5      nU RU                  U5      nUS 4$ )Na  DiffLlamaModel is using DiffLlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r_   r   rZ   r   r   r   r   r   r!   rK   r   rL   rJ   r   cudaTFr   )	attn_mask	dropout_pr   r   )+rv   rw   r,   r@   rs   r   r   r   r   rz   rh   r   rf   r^   r   rm   ri   r|   rO   rP   r   r   rN   devicetyper   r   r   scaled_dot_product_attentionr   rx   rl   r   r   r   r   r   r   r   r   r   r   r   )r8   r_   r   r   rZ   r   r   r   r   r   r   r   r   r   r   r   rX   rY   r   r   r   r   r   r   r   r   r   r9   s                              r:   r@   DiffLlamaSdpaAttention.forward  s    [ 7?+-)-"3#-$7 # 	 	 &**,A{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&S#7RUWZ#[ %#&snUL'5'<'<ZW[WeWegs't$Jz+D+DE
 /H/HIyy\1!!D"M#**1aA6$%%aA/E1A1A"1E/E&EFK ##v-+2I'224L#..0J'224L (/EAID5	hh))FF!04d,,3 G 
 &+[[aQ%G"l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<"[<%??4+++t~~k/JJ!++Aq1<<>!&&s26kk+.D  r<    r   )rB   rC   rD   rE   r   rO   r   r   r   r   r	   r   r@   rF   rG   rH   s   @r:   r   r     s     2637*."'59\!||\! #5<<#=>\! !.	\!
 u//0\! !\!  \! \! !!1!12\! 
u||Xell3XeELL>Q5RR	S\! \!r<   r   r   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )DiffLlamaRMSNormi  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z/
DiffLlamaRMSNorm is equivalent to T5LayerNorm
N)r,   r-   r   r   rO   onesr   variance_epsilon)r8   r/   rt   r9   s      r:   r-   DiffLlamaRMSNorm.__init__  s/     	ll5::k#:; #r<   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )NrK   rJ   T)keepdim)	r   r   rO   r   powmeanrsqrtr   r   )r8   r_   r   variances       r:   r@   DiffLlamaRMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r<   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   rN   r   r8   s    r:   
extra_reprDiffLlamaRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr<   )r   r   )gư>)	rB   rC   rD   rE   r-   r@   r   rF   rG   rH   s   @r:   r   r     s    $;J Jr<   r   )eagerflash_attention_2sdpac                     ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\	\\R                  \R                  4      S\\   S\\R                   \	\\R                   \R                   4      4   4S jjrSrU =r$ )DiffLlamaDecoderLayeri  r.   rm   c                 (  > [         TU ]  5         UR                  U l        [        UR                     " XS9U l        [        U5      U l        [        UR                  UR                  S9U l
        [        UR                  UR                  S9U l        g )N)r.   rm   rt   )r,   r-   r/   DIFFLLAMA_ATTENTION_CLASSES_attn_implementation	self_attnr&   mlpr   r   input_layernormpost_attention_layernormr   s      r:   r-   DiffLlamaDecoderLayer.__init__  sw    !--4V5P5PQY_u'/0B0BH[H[\(89K9KQWQdQd(e%r<   r_   r   rZ   r   r   r   r   r   r   ra   c	                     Un
U R                  U5      nU R                  " SUUUUUUUUS.U	D6u  pX-   nUn
U R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU$ )Nr   r   )r  r  r	  r  )r8   r_   r   rZ   r   r   r   r   r   r   residualself_attn_weightsoutputss                r:   r@   DiffLlamaDecoderLayer.forward  s     !,,]; ,0>> 
,
')%)/) 3
,
 
,
( !0 !55mD/ 0 "++Gr<   )r/   r  r  r	  r  )NNNFFNN)rB   rC   rD   rE   r"   r   r-   rO   r   r   r   r	   r   r   r   r   FloatTensorr@   rF   rG   rH   s   @r:   r  r    s   f f3 f 2637*.,1$)59KO'||' !.' u//0	'
 !' $D>' D>' !!1!12' &eELL%,,,F&GH' -.' 
u  (51B1BEDUDU1U+V"WW	X' 'r<   r  c                   N    \ rS rSr\rSrSrS/rS/r	Sr
SrSrSrSrSrSrS rSrg	)
DiffLlamaPreTrainedModeli:  modelTr  past_key_valuesFc                    U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g [        U[        5      (       a&  UR
                  R                  R                  S5        g [        U[        5      (       a  UR                  R                  R                  SU R                   R                   5        UR"                  R                  R                  SU R                   R                   5        UR$                  R                  R                  SU R                   R                   5        UR&                  R                  R                  SU R                   R                   5        g g )Nr   )r   stdg      ?r   )r.   initializer_ranger   r   r1   r   datanormal_r+   zero_	Embeddingpadding_idxr   fill_rp   r   r   r   r   r   )r8   moduler  s      r:   _init_weights&DiffLlamaPreTrainedModel._init_weightsI  s   kk++fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . 011MM$$S) 233!!))!T[[-G-GH!!))!T[[-G-GH!!))!T[[-G-GH!!))!T[[-G-GH	 4r<   r   N)rB   rC   rD   rE   r"   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendr  rF   r   r<   r:   r  r  :  sT    "L&*#01#4"5!N  $!"'Ir<   r  c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )DiffLlamaRotaryEmbeddingi\  r.   c                   > [         TU ]  5         [        US5      (       aH  UR                  b;  UR                  R	                  SUR                  R	                  S5      5      U l        OSU l        UR                  U l        UR                  U l        Xl	        [        U R
                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                  U l        g )Nrope_scaling	rope_typer   defaultinv_freqF)
persistent)r,   r-   r   r0  getr1  r}   max_seq_len_cachedoriginal_max_seq_lenr.   r   rope_init_fnattention_scalingregister_bufferr3  original_inv_freq)r8   r.   r   r3  r9   s       r:   r-   !DiffLlamaRotaryEmbedding.__init__]  s    6>**v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r<   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rJ   r!   mpscpuF)device_typeenabledrK   rL   )r   )r3  floatrc   rN   r   r   r   r   strrO   autocastr   rP   rX   r9  rY   r   )
r8   r?   rZ   inv_freq_expandedposition_ids_expandedr@  freqsembrX   rY   s
             r:   r@    DiffLlamaRotaryEmbedding.forwardn  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r9  r.   r6  r;  r7  r8  r1  r>   )rB   rC   rD   rE   r"   r-   rO   no_gradr   r@   rF   rG   rH   s   @r:   r.  r.  \  s6    / / /" ]]_<  <r<   r.  c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	         SS\
\R                     S\
\R                     S\
\R                     S	\
\   S
\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\   S\4S jj5       5       r SS\\R                  S4   S\R                  S\R                  S	\S\4
S jjr\S\R                  S\S\S\R2                  S\R                  S\4S j5       rSrU =r$ )DiffLlamaModeli~  r.   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr  )r.   F)r,   r-   pad_token_idr  
vocab_sizer   r  r/   embed_tokens
ModuleListrangenum_hidden_layersr  layersr   r   normr.  r   gradient_checkpointing	post_initr   s      r:   r-   DiffLlamaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammGLVMeMeGfgGf)"65Gfg
 %V%7%7V=P=PQ	2&A&+# 	 hs   C?c                     U R                   $ r>   rP  r   s    r:   get_input_embeddings#DiffLlamaModel.get_input_embeddings  s       r<   c                     Xl         g r>   rZ  r8   values     r:   set_input_embeddings#DiffLlamaModel.set_input_embeddings  s    !r<   	input_idsr   rZ   r  inputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsra   c
                 J   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        Sn[        U[        S 5      [        45      (       d  [	        S5      eUc  U R                  U5      nU(       a  Uc
  [        5       nU	cD  Ub  UR                  5       OSn[        R                   " XUR"                  S   -   UR$                  S9n	Uc  U	R'                  S5      nU R)                  X%XU5      nUnU R+                  X5      nU(       a  SOS nU(       a  SOS nU R,                  S U R                   R.                    H7  nU(       a  X4-  nU" U4UUUUUU	US	.U
D6nUS   nU(       d  M.  UUS   4-  nM9     U R1                  U5      nU(       a  X4-  n[3        UU(       a  UOS UUS
9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBThe `past_key_values` should be either a `Cache` object or `None`.r   r!   r   r   )r   rZ   r   r   r   r   r   )last_hidden_stater  r_   
attentions)r.   r   rd  r   r   rV  r   rv   rw   r   r   r	   rP  r
   get_seq_lengthrO   arangerN   r   rU   _update_causal_maskr   rT  rS  rU  r   )r8   rb  r   rZ   r  rc  r   r   rd  r   re  past_seen_tokensr   r_   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r:   r@   DiffLlamaModel.forward  sI    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I /DJ+>??abb  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 & #oomJ #7BD0d![[)H4;;+H+HIM#!%55!)
*)."3#-$7
 $
M *!,M  =#3"55' J* 		-0  !11&+/8Od+%	
 	
r<   r#   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nr   r   flex_attentionr   Fr   )rc  past_key_values_lengthis_trainingr!   rJ   )sequence_lengthtarget_lengthr   r   
batch_size)r   xpunpu)r.   r  anyr   rO   r   r$   rj  is_compileabler   _ignore_causal_mask_sdpar   r   rN   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r   finfomin_unmask_unattended)r8   r   rs  r   r  r   rm  using_compilable_cacher   rx  ry  r   	min_dtypes                r:   rl  "DiffLlamaModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr<   rx  ry  r   rz  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuer   r   r!   )diagonalrg  rJ   r   )rM   rO   r  r  fullr   triurk  rd   rc   clonerN   r   masked_fill)r   rx  ry  r   r   rz  r   r   r  mask_lengthpadding_masks              r:   r  DDiffLlamaModel._prepare_4d_causal_attention_mask_with_cache_position:  s}   < %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r<   )rP  rV  rT  rU  r  r   rO  	NNNNNNNNN)F)rB   rC   rD   rE   r"   r-   r[  r`  r   r   r   rO   r   r   r	   r  r   r   r   r   r@   r   rl  staticmethodr   r   r  rF   rG   rH   s   @r:   rL  rL  ~  s     !"  151537+/59$(,0/359\
E,,-\
 !.\
 u//0	\

 "%\
   1 12\
 D>\
 $D>\
 'tn\
 !!1!12\
 $$89\
 
!\
  \
H #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r<   rL  c                       \ rS rSrSrg)KwargsForCausalLMir  r   N)rB   rC   rD   rE   rF   r   r<   r:   r  r  r  s    3r<   r  c                     ^  \ rS rSrS/rSS0rSS/S/40rU 4S jrS rS	 r	S
 r
S rS rS r\\           SS\\R$                     S\\R&                     S\\R$                     S\\   S\\R*                     S\\R$                     S\\   S\\   S\\   S\\R$                     S\\\R&                  4   S\\   S\4S jj5       5       rSrU =r$ )DiffLlamaForCausalLMiu  zlm_head.weightlm_headcolwise_repr_   logitsc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r)   )
r,   r-   rL  r  rO  r   r1   r/   r  rW  r7   s     r:   r-   DiffLlamaForCausalLM.__init__{  sU     #F+
 ++yy!3!3V5F5FUS 	r<   c                 .    U R                   R                  $ r>   r  rP  r   s    r:   r[  )DiffLlamaForCausalLM.get_input_embeddings      zz&&&r<   c                 $    XR                   l        g r>   r  r^  s     r:   r`  )DiffLlamaForCausalLM.set_input_embeddings      "'

r<   c                     U R                   $ r>   r  r   s    r:   get_output_embeddings*DiffLlamaForCausalLM.get_output_embeddings  s    ||r<   c                     Xl         g r>   r  )r8   new_embeddingss     r:   set_output_embeddings*DiffLlamaForCausalLM.set_output_embeddings  s    %r<   c                     Xl         g r>   r  )r8   decoders     r:   set_decoder DiffLlamaForCausalLM.set_decoder  s    
r<   c                     U R                   $ r>   r  r   s    r:   get_decoder DiffLlamaForCausalLM.get_decoder  s    zzr<   rb  r   rZ   r  rc  labelsr   r   rd  r   logits_to_keepr   ra   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb)  U R                  " SUX`R                   R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, DiffLlamaForCausalLM

>>> model = DiffLlamaForCausalLM.from_pretrained("google/diffllama-7b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/diffllama-7b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```N)	rb  r   rZ   r  rc  r   r   rd  r   )r  r  rO  lossr  r  r_   ri  r   )r.   r   rd  r  rh  r   r   slicer  loss_functionrO  r   r  r_   ri  )r8   rb  r   rZ   r  rc  r  r   r   rd  r   r  r   r  r_   slice_indicesr  r  s                     r:   r@   DiffLlamaForCausalLM.forward  s   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%%pVF{{OeOepiopD%#33!//))
 	
r<   )r  r  rO  )NNNNNNNNNNr   )rB   rC   rD   rE   _tied_weights_keys_tp_plan_pp_planr-   r[  r`  r  r  r  r  r   r   r   rO   r   r   r	   r  r   r   r   r   r  r   r@   rF   rG   rH   s   @r:   r  r  u  s   *+=)H_-z:;H'(&  151537+/59-1$(,0/35934G
E,,-G
 !.G
 u//0	G

 "%G
   1 12G
 ))*G
 D>G
 $D>G
 'tnG
 !!1!12G
 c5<</0G
 *+G
 
 G
  G
r<   r  a  
    The DiffLlama Model transformer with a sequence classification head on top (linear layer).

    [`DiffLlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   *  ^  \ rS rSrU 4S jrS rS r\\         SS\	\
R                     S\	\
R                     S\	\
R                     S\	\   S	\	\
R                     S
\	\
R                     S\	\   S\	\   S\	\   S\4S jj5       5       rSrU =r$ )"DiffLlamaForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g r)   )
r,   r-   
num_labelsrL  r  r   r1   r/   scorerW  r7   s     r:   r-   +DiffLlamaForSequenceClassification.__init__  sS      ++#F+
YYv114??O
 	r<   c                 .    U R                   R                  $ r>   r  r   s    r:   r[  7DiffLlamaForSequenceClassification.get_input_embeddings  r  r<   c                 $    XR                   l        g r>   r  r^  s     r:   r`  7DiffLlamaForSequenceClassification.set_input_embeddings  r  r<   rb  r   rZ   r  rc  r  r   r   rd  ra   c
                    U R                  UUUUUUUU	S9n
U
R                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                  R
                  c  US:w  a  [        S5      eU R                  R
                  c  SnOUb  XR                  R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUb  U R#                  XUU R                  S
9n[%        UUU
R&                  U
R(                  U
R*                  S9$ )e  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r   rZ   r  rc  r   r   rd  Nr   r!   z=Cannot handle batch sizes > 1 if no padding token is defined.rJ   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rg  )r  r  pooled_logitsr.   r  )r  rh  r  rN   r.   rN  r   r   r   rO   int32rk  argmaxrv   rw   r9   rB   r  r   r  r_   ri  )r8   rb  r   rZ   r  rc  r  r   r   rd  transformer_outputsr_   r  rz  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                      r:   r@   *DiffLlamaForSequenceClassification.forward   s   * 8<zz)%+'/!5 8B 	8
 ,==M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab%%VR_hlhshs%tD/ /??-;;*55
 	
r<   )r  r  r  r  )rB   rC   rD   rE   r-   r[  r`  r   r   r   rO   r   r   r	   r  r   r   r@   rF   rG   rH   s   @r:   r  r    s    '(  151537+/59-1$(,0/3A
E,,-A
 !.A
 u//0	A

 "%A
   1 12A
 ))*A
 D>A
 $D>A
 'tnA
 
*A
  A
r<   r  c                   B  ^  \ rS rSrSrU 4S jrS rS r\\	         SS\
\R                     S\
\R                     S\
\R                     S	\
\   S
\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\4S jj5       5       rSrU =r$ )DiffLlamaForQuestionAnsweringiF  transformerc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  S5      U l        U R                  5         g )NrK   )	r,   r-   rL  r  r   r1   r/   
qa_outputsrW  r7   s     r:   r-   &DiffLlamaForQuestionAnswering.__init__J  sA     )&1))F$6$6: 	r<   c                 .    U R                   R                  $ r>   r  rP  r   s    r:   r[  2DiffLlamaForQuestionAnswering.get_input_embeddingsR  s    ,,,r<   c                 $    XR                   l        g r>   r  r^  s     r:   r`  2DiffLlamaForQuestionAnswering.set_input_embeddingsU  s    (-%r<   rb  r   rZ   r  rc  start_positionsend_positionsr   rd  ra   c
           
         U R                  UUUUUUU	S9nUR                  nU R                  U5      nUR                  SSS9u  pUR	                  S5      R                  5       nUR	                  S5      R                  5       nS nUb  Ub  U R                  " XXg40 U
D6n[        UUUUR                  UR                  S9$ )N)r   rZ   r  rc  r   rd  r!   rJ   rL   )r  start_logits
end_logitsr_   ri  )
r  rh  r  splitsqueezer   r  r   r_   ri  )r8   rb  r   rZ   r  rc  r  r  r   rd  r   r  sequence_outputr  r  r  r  s                    r:   r@   %DiffLlamaForQuestionAnswering.forwardX  s     ,0+;+;)%+'/!5 ,< ,
 "331#)<<r<#: #++B/::<''+668
&=+D%%libhiD+%!!//))
 	
r<   )r  r  r  )rB   rC   rD   rE   r"  r-   r[  r`  r   r   r   rO   r   r   r	   r  r   r   r@   rF   rG   rH   s   @r:   r  r  F  s    %-.  151537+/596:48,0/3(
E,,-(
 !.(
 u//0	(

 "%(
   1 12(
 "%"2"23(
   0 01(
 $D>(
 'tn(
 
&(
  (
r<   r  c                   *  ^  \ rS rSrU 4S jrS rS r\\         SS\	\
R                     S\	\
R                     S\	\
R                     S\	\   S	\	\
R                     S
\	\
R                     S\	\   S\	\   S\	\   S\4S jj5       5       rSrU =r$ )DiffLlamaForTokenClassificationi  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        USS 5      b  UR                  nO[        USS 5      b  UR                  nOSn[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropouthidden_dropoutg?)r,   r-   r  rL  r  r{   r  r  r   Dropoutr   r1   r/   r  rW  )r8   r.   r  r9   s      r:   r-   (DiffLlamaForTokenClassification.__init__  s      ++#F+
6/6B!'!:!:V-t4@!'!6!6!$zz"45YYv1163D3DE
 	r<   c                 .    U R                   R                  $ r>   r  r   s    r:   r[  4DiffLlamaForTokenClassification.get_input_embeddings  r  r<   c                 $    XR                   l        g r>   r  r^  s     r:   r`  4DiffLlamaForTokenClassification.set_input_embeddings  r  r<   rb  r   rZ   r  rc  r  r   r   rd  ra   c
                    U R                  UUUUUUUU	S9n
U
R                  nU R                  U5      nU R                  U5      nSnUb  U R	                  XU R
                  5      n[        UUU
R                  U
R                  S9$ )r  r  N)r  r  r_   ri  )	r  rh  r   r  r  r.   r   r_   ri  )r8   rb  r   rZ   r  rc  r  r   r   rd  r  r  r  r  s                 r:   r@   'DiffLlamaForTokenClassification.forward  s    * ,0::)%+'/!5 ,6 	,
 "33,,7O,%%fdkkBD$!//))	
 	
r<   )r   r  r  r  r  )rB   rC   rD   rE   r-   r[  r`  r   r   r   rO   r   r   r	   r  r   r   r@   rF   rG   rH   s   @r:   r  r    s     '(  151537+/59-1$(,0/3*
E,,-*
 !.*
 u//0	*

 "%*
   1 12*
 ))**
 D>*
 $D>*
 'tn*
 
*
  *
r<   r  )r  rL  r  r  r  r  )Nr!   )Lrk   typingr   r   r   rO   r   activationsr   cache_utilsr	   r
   r   
generationr   integrationsr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   r   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r    configuration_diffllamar"   !torch.nn.attention.flex_attentionr#   integrations.flex_attentionr$   
get_loggerrB   rv   Moduler&   rS   r^   r   r   ri   rn   rp   r   r   r   r  r  r  r.  rL  r  r  r  r  r  __all__r   r<   r:   <module>r     s&  0  ) )   ! ; ; ) 7 > 
 :  L - & h h 4  !!;J 
		H	%299  (6	UU\\ 	U# 	U%,, 	U2g) g)TS)1 S)ld!/ d!N Y'Jryy J (J*  1" 26 2j I I IB<ryy <D p- p pf ?,j > i
3_ i
 i
X S
)A S
S
l ;
$< ;
 ;
| C
&> C
 C
Lr<   