# coding=utf-8
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import Callable, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...cache_utils import Cache, HybridCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_gemma2 import Gemma2Config


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class Gemma2RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float())
        # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
        # See https://github.com/huggingface/transformers/pull/29402
        output = output * (1.0 + self.weight.float())
        return output.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


class Gemma2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_activation]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = module.head_dim**-0.5

    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class Gemma2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = config.query_pre_attn_scalar**-0.5
        self.attention_dropout = self.config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.attn_logit_softcapping = self.config.attn_logit_softcapping
        self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
                "sliding_window": self.sliding_window,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

            # Here we need to slice as we use a static cache by default, but FA2 does not support it
            if attention_mask is not None and self.config._attn_implementation == "flash_attention_2":
                seq_len = attention_mask.shape[-1]
                key_states, value_states = key_states[:, :, :seq_len, :], value_states[:, :, :seq_len, :]

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            softcap=self.attn_logit_softcapping,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Gemma2DecoderLayer(nn.Module):
    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.config = config
        self.is_sliding = not bool(layer_idx % 2)
        self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
        self.mlp = Gemma2MLP(config)
        self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window

    @deprecate_kwarg("last_cache_position", version="4.53.0")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
            # In prefill, we may be larger than sliding window
            effective_seq_len = max(cache_position.shape[0], self.sliding_window)
            # For FA2, the mask is 2D and is of shape [bs, processed_tokens] (not [bs, max_cache_len]),
            # thus we must slice from the right (at most `effective_seq_len` elements)
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask[:, -effective_seq_len:]
            # Otherwise, the mask is 4D of shape [bs, 1, query_len, max_cache_len] thus we must slice
            # from the left, with an offset if we are beyond the sliding window
            else:
                min_dtype = torch.finfo(attention_mask.dtype).min
                sliding_window_mask = torch.tril(
                    torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
                )
                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                # In case we are beyond the sliding window, we need to correctly offset the mask slicing
                offset = cache_position[-1] - effective_seq_len + 1
                # Should only be used when beyond the sliding window (i.e. offset > 0)
                offset = torch.clamp(offset, min=0)
                # Equivalent to: `attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]`,
                # but without data-dependent slicing (i.e. torch.compile friendly)
                mask_indexes = torch.arange(
                    min(effective_seq_len, attention_mask.shape[-1]), device=attention_mask.device
                )
                mask_indexes += offset
                attention_mask = attention_mask[:, :, :, mask_indexes]

        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs
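

# Illustrative sketch (not part of the upstream module): with grouped-query
# attention, `repeat_kv` tiles each KV head so it lines up with its group of
# query heads. Example sizes are made up; 8 query heads over 2 KV heads gives
# n_rep = 4.
def _demo_repeat_kv():
    batch, kv_heads, seq_len, head_dim = 2, 2, 5, 16
    n_rep = 4  # num_attention_heads // num_key_value_heads
    kv = torch.randn(batch, kv_heads, seq_len, head_dim)
    expanded = repeat_kv(kv, n_rep)
    assert expanded.shape == (batch, kv_heads * n_rep, seq_len, head_dim)
    # Heads 0..3 are copies of KV head 0, heads 4..7 are copies of KV head 1.
    assert torch.equal(expanded[:, 0], kv[:, 0]) and torch.equal(expanded[:, 3], kv[:, 0])
    assert torch.equal(expanded[:, 4], kv[:, 1])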
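

# Illustrative sketch (not part of the upstream module): Gemma 2's logit
# soft-capping squashes raw attention scores into (-softcap, +softcap) via
# softcap * tanh(scores / softcap), which is what the `softcap` branch of
# `eager_attention_forward` implements. Plain tensors stand in for real
# query/key projections, and 50.0 is an arbitrary example cap.
def _demo_logit_softcap():
    softcap = 50.0
    scores = torch.linspace(-500, 500, steps=7)
    capped = softcap * torch.tanh(scores / softcap)
    # Extreme scores are bounded by the cap...
    assert capped.abs().max() < softcap
    # ...while small scores pass through nearly unchanged (tanh(x) ~ x near 0).
    assert torch.allclose(capped[3], scores[3])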
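

# Illustrative sketch (not part of the upstream module): even-indexed layers
# (`is_sliding=True`) additionally mask out keys older than `sliding_window`
# tokens by filling everything below the `-sliding_window` diagonal, mirroring
# the masking branch in `Gemma2DecoderLayer.forward` for non-FA2 backends.
# Tiny toy sizes keep the tensors readable.
def _demo_sliding_window_mask():
    seq_len, sliding_window = 6, 3
    min_dtype = torch.finfo(torch.float32).min
    causal = torch.full((seq_len, seq_len), min_dtype).triu(diagonal=1)  # plain causal part
    too_old = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=-sliding_window)
    mask = torch.where(too_old, min_dtype, causal)
    # Query 5 may only look at keys 3, 4 and 5 (a window of 3 positions).
    visible = (mask[5] == 0).nonzero().flatten()
    assert visible.tolist() == [3, 4, 5]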
U =r$ )Gemma2RotaryEmbeddingi^  rZ   c                   > [         TU ]  5         [        US5      (       aH  UR                  b;  UR                  R	                  SUR                  R	                  S5      5      U l        OSU l        UR                  U l        UR                  U l        Xl	        [        U R
                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                  U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r(   r)   hasattrr   r   r   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrZ   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r/   rZ   r   r   r0   s       r1   r)   Gemma2RotaryEmbedding.__init___  s    6>**v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r3   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r6   r   mpscpuF)device_typeenabledr5   rh   r   )r   r@   r|   rG   r   r   
isinstancer   strr,   autocastr   ri   rq   r  rr   r   )
r/   r;   rs   inv_freq_expandedposition_ids_expandedr  freqsembrq   rr   s
             r1   rC   Gemma2RotaryEmbedding.forwardp  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r  rZ   r  r  r  r  r   r'   )rK   rL   rM   rN   r   r)   r,   no_gradr   rC   rP   rQ   rR   s   @r1   r   r   ^  s6    /| / /" ]]_<  <r3   r   c                   N    \ rS rSr\rSrSrS/rS/r	Sr
SrSrSrSrSrSrS rSrg)	Gemma2PreTrainedModeli  modelTr   past_key_valuesc                    U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g [        U[        5      (       a&  UR
                  R                  R                  S5        g g )Nr   )r:   stdr?   )rZ   initializer_ranger  r*   r]   r.   datanormal_rY   zero_	Embeddingpadding_idxr"   fill_)r/   r   r  s      r1   _init_weights#Gemma2PreTrainedModel._init_weights  s    kk++fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> ...MM$$S) /r3   r   N)rK   rL   rM   rN   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendr$  rP   r   r3   r1   r  r    sS    L&*#-.#4"5!N  $!"&*r3   r  c                   6  ^  \ rS rSrS\4U 4S jjrS rS r\\	         SS\
\R                     S\
\R                     S\
\R                     S	\
\   S
\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\   S\4S jj5       5       r\R*                  " 5        SS\\R                  S4   S\R                  S\R                  S	\S\4
S jj5       r\S\R                  S\S\S\R4                  S\R                  S\4S j5       rSrU =r$ )Gemma2Modeli  rZ   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr   )rZ   F)r(   r)   pad_token_idr"  
vocab_sizer*   r!  r[   embed_tokens
ModuleListrangenum_hidden_layersr   layersr"   r   normr   
rotary_embgradient_checkpointing	post_initr   s      r1   r)   Gemma2Model.__init__  s     !.. ++LL):):F<N<NPTP`P`ammDI&JbJbDcdDcy2Dcd
 "&"4"4&:M:MN	/v>&+# 	 es   C?c                     U R                   $ r'   r7  rH   s    r1   get_input_embeddings Gemma2Model.get_input_embeddings  s       r3   c                     Xl         g r'   rB  r/   r   s     r1   set_input_embeddings Gemma2Model.set_input_embeddings  s    !r3   	input_idsr   rs   r  inputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsrz   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       aN  UcK  U R                  (       d:  UR                  u  pn[        U R                   UUUR                  U R                  S9nU	cD  Ub  UR                  5       OSn[        R                   " XUR                  S   -   UR                  S9n	Uc  U	R#                  S5      nU R%                  X%XU5      nUnU R'                  UU5      n[        R(                  " U R                   R*                  S-  UR                  S	9nUU-  nU(       a  S
OS nU(       a  S
OS nU R,                  S U R                   R.                    H  nU(       a  UU4-  nU R
                  (       a?  U R                  (       a.  U R1                  [3        UR4                  40 U
D6UUUUUUUU	5	      nOU" U4UUUUUUU	S.U
D6nUS   nU(       d  M  UUS   4-  nM     U R7                  U5      nU(       a  UU4-  n[9        UUUUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)max_batch_sizemax_cache_lenr   r   r   r   r   g      ?r   r   )r   r   rs   r   r   r   r   )last_hidden_stater  rx   
attentions)rZ   r   rK  r   
ValueErrorr>  r   r   r   r7  rG   r   r   r   get_seq_lengthr,   r   rn   _update_causal_maskr=  tensorr[   r;  r:  _gradient_checkpointing_funcr   __call__r<  r   )r/   rI  r   rs   r  rJ  r   r   rK  r   rL  
batch_sizer   _past_seen_tokensr   rx   r   
normalizerall_hidden_statesall_self_attnsdecoder_layerlayer_outputss                          r1   rC   Gemma2Model.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0%2%8%8"J))%#)){{O !CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]

 & #oom\J
 \\$++"9"93">mFYFYZ
%
2 #7BD0d![[)H4;;+H+HIM#!m%55!**t}} $ A AM22H6GH!' #%"
! !.!
!(;#.!-#2&7'#1
! (
! *!,M  =#3"55A JD 		-0-!11&+++%	
 	
r3   r   input_tensorc           
         U R                   R                  S:X  a  U$ U R                   R                  S:X  a,  [        U[        R                  5      (       a  [        U5      nU$ UR                  UR                  pvUR                  S   n[        U[        [        45      (       a  UR                  5       n	O!Ub  UR                  S   OUR                  S   n	U R                  UUU	UUUUR                  S   S9n
U
$ )Nr   flex_attentionr   r6   r   sequence_lengthtarget_lengthr   r   r   rX  )rZ   r   r  r,   r   r    r   r   rG   r   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_position)r/   r   ra  r   r  r   r   r   re  rf  r   s              r1   rT  Gemma2Model._update_causal_mask,  s     ;;++/BB!!;;++/??.%,,77!<^!L!!$**L,?,?v&,,Q/o['ABB+??AM8F8RN004XdXjXjklXmM PP+')#))!, Q 
 r3   re  rf  r   rX  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuer   r   r   r   r   r6   r   )r$   r,   r   r   fullr   triur   r}   r|   clonerG   r   masked_fill)r   re  rf  r   r   rX  r   r   r   mask_lengthpadding_masks              r1   rh  AGemma2Model._prepare_4d_causal_attention_mask_with_cache_positionS  s}   < %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r3   )r7  r>  r;  r<  r"  r=  r6  	NNNNNNNNN)F)rK   rL   rM   rN   r   r)   rC  rG  r   r   r   r,   r   r   r   r   r   r   r   r   rC   r  r   rT  staticmethodrO   r   rh  rP   rQ   rR   s   @r1   r3  r3    s   |  !"  1515371559$(,0/359s
E,,-s
 !.s
 u//0	s

 "+.s
   1 12s
 D>s
 $D>s
 'tns
 !!1!12s
 $$89s
 
!s
  s
j ]]_ #($ellK78$ ll$ 	$
 %$  $ $L 444 4 {{	4
 4 4 4r3   r3  c                     ^  \ rS rSrS/rSS0rSS/S/40rU 4S jrS rS	 r	S
 r
S rS rS r\\           SS\\R$                     S\\R&                     S\\R$                     S\\   S\\R*                     S\\R$                     S\\   S\\   S\\   S\\R$                     S\\\R&                  4   S\4S jj5       5       r       SU 4S jjrSrU =r$ )Gemma2ForCausalLMi  zlm_head.weightlm_headcolwise_reprx   logitsc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g rW   )
r(   r)   r3  r  r6  r*   r]   r[   rx  r?  rc   s     r1   r)   Gemma2ForCausalLM.__init__  sU      (
 ++yy!3!3V5F5FUS 	r3   c                 .    U R                   R                  $ r'   r  r7  rH   s    r1   rC  &Gemma2ForCausalLM.get_input_embeddings      zz&&&r3   c                 $    XR                   l        g r'   r~  rF  s     r1   rG  &Gemma2ForCausalLM.set_input_embeddings      "'

r3   c                     U R                   $ r'   rx  rH   s    r1   get_output_embeddings'Gemma2ForCausalLM.get_output_embeddings  s    ||r3   c                     Xl         g r'   r  )r/   new_embeddingss     r1   set_output_embeddings'Gemma2ForCausalLM.set_output_embeddings  s    %r3   c                     Xl         g r'   r  )r/   decoders     r1   set_decoderGemma2ForCausalLM.set_decoder  s    
r3   c                     U R                   $ r'   r  rH   s    r1   get_decoderGemma2ForCausalLM.get_decoder  s    zzr3   rI  r   rs   r  rJ  labelsr   r   rK  r   logits_to_keeprz   c                 F   U R                   (       aG  U R                  R                  S:w  a-  [        R	                  SU R                  R                   S35        Ub  UOU R                  R
                  nU	b  U	OU R                  R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bH  UU R                  R                  -  n[        R                  " U5      nUU R                  R                  -  nSnUb  U R                   " UX`R"                  40 UD6n[%        UUUR&                  UR(                  UR*                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Gemma2ForCausalLM

>>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```r   zhIt is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	rI  r   rs   r  rJ  r   r   rK  r   lossrz  r  rx   rQ  r   )r   rZ   r   r   r   r   rK  r  rP  r  rO   slicerx  final_logit_softcappingr,   r   loss_functionr6  r   r  rx   rQ  )r/   rI  r   rs   r  rJ  r  r   r   rK  r   r  loss_kwargsr   rx   slice_indicesrz  r  s                     r1   rC   Gemma2ForCausalLM.forward  s   P ==T[[==H#{{??@  Aqr 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%%ffooUUD%#33!//))
 	
r3   c	                   > [         TU ]  " U4UUUUUUUS.U	D6n
Uc  U
R                  SS 5      n[        U[        5      (       a  UR
                  S:X  a  U R                  R                  S:X  d  U
S   b"  U
S   R                  u  pnU
S   R                  nO U
S   R                  u  pU
S   R                  nU R                  R                  UUUR                  5       U R                  R                  R                  UUUS9nX:S'   U
$ )	N)r  r   rJ  r   rs   r   r  r  r5   r   rJ  rI  rd  r   )r(   prepare_inputs_for_generationpopr  r   ndimrZ   r   rG   r   r  rh  rg  rx  r.   r   )r/   rI  r  r   rJ  r   rs   r   r  r   model_inputsrY  rX  re  r   r0   s                  r1   r  /Gemma2ForCausalLM.prepare_inputs_for_generation   s2    w<

+)')%)

 

 !  !148A 44##q(KK448KKO,81=o1N1T1T.
Q%o6==.:;.G.M.M+
%k299!ZZ]] /-AACll))//-% ^ N .<)*r3   )rx  r  r6  )NNNNNNNNNNr   )NNNNNTN)rK   rL   rM   rN   _tied_weights_keys_tp_plan_pp_planr)   rC  rG  r  r  r  r  r   r   r   r,   r   r   r   r   r   r   rO   r   rC   r  rP   rQ   rR   s   @r1   rw  rw    s   *+=)H_-z:;H'(&  1515371559-1$(,0/35934P
E,,-P
 !.P
 u//0	P

 "+.P
   1 12P
 ))*P
 D>P
 $D>P
 'tnP
 !!1!12P
 c5<</0P
 
 P
  P
j 4 4r3   rw  a  


@auto_docstring(
    custom_intro="""
    The Gemma2 Model transformer with a sequence classification head on top (linear layer).

    [`Gemma2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class Gemma2ForSequenceClassification(Gemma2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Gemma2Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        transformer_outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class Gemma2ForTokenClassification(Gemma2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Gemma2Model(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> TokenClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "Gemma2ForCausalLM",
    "Gemma2Model",
    "Gemma2PreTrainedModel",
    "Gemma2ForSequenceClassification",
    "Gemma2ForTokenClassification",
]