
    fTh                       S SK r S SKJr  S SKJr  S SKJr  S SKJrJ	r	J
r
Jr  S SKrS SKJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.  SSK/J0r0J1r1  \(" 5       (       a  S SK2J3r3  SSK4J5r5  \*Rl                  " \75      r8\ " S S\5      5       r9\ " S S\%5      5       r: " S S\Rv                  5      r< " S S\Rz                  5      r> " S S \Rz                  5      r? " S! S"\Rz                  5      r@S# rASFS$ jrBS%\R                  S&\DS'\R                  4S( jrE   SGS)\Rz                  S*\R                  S+\R                  S,\R                  S-\	\R                     S.\FS/\	\F   S0\	\F   S'\
\R                  \R                  4   4S1 jjrG " S2 S3\Rz                  5      rH " S4 S5\Rz                  5      rI\& " S6 S7\!5      5       rJ\& " S8 S9\J5      5       rK\& " S: S;\J\5      5       rL " S< S=\Rz                  5      rM\&" S>S?9 " S@ SA\J5      5       rN\&" SBS?9 " SC SD\J\5      5       rO/ SEQrPg)H    N)Callable)	dataclass)partial)ListOptionalTupleUnion   )ACT2FN)CacheHybridCacheStaticCache)GenerationMixin)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging)deprecate_kwarg   )	AutoModel   )Gemma3ConfigGemma3TextConfig)	BlockMask)make_flex_block_causal_maskc                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)Gemma3ModelOutputWithPast=   a  
Base class for Gemma3 outputs, with hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r)   r   torchFloatTensor__annotations____static_attributes__r*       b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/gemma3/modeling_gemma3.pyr'   r'   =   s    8 8<%"3"34;r4   r'   c                   &   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\\R                     \4      \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)Gemma3CausalLMOutputWithPast^   aB  
Base class for Gemma3 causal language model (or autoregressive) outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr)   r*   )r+   r,   r-   r.   r/   r9   r   r0   r1   r2   r:   r;   r	   r   r   r<   r   r=   r)   r3   r*   r4   r5   r7   r7   ^   s    < )-D(5$$
%,*.FHU&&'.GKOXeD):):$;U$BCDK8<M8E%"3"345<59Ju001297;%"3"34;r4   r7   c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3TextScaledWordEmbedding   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 p   > [         TU ]  XU5        U R                  S[        R                  " U5      SS9  g )NrD   F
persistent)super__init__register_bufferr0   tensor)selfrA   rB   rC   rD   	__class__s        r5   rI   &Gemma3TextScaledWordEmbedding.__init__   s1    D]ELL,ERWXr4   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ N)rH   forwardrD   toweightdtype)rL   rO   rM   s     r5   rR   %Gemma3TextScaledWordEmbedding.forward   s2    wy)D,<,<,?,?@Q@Q,RRRr4   r*   )      ?)r+   r,   r-   r.   r/   intfloatrI   r0   TensorrR   r3   __classcell__rM   s   @r5   r?   r?      sM    Ys Y3 YS Y_d Y YS S Sr4   r?   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )	Gemma3MLP   configc                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFbias)rH   rI   r`   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrL   r`   rM   s     r5   rI   Gemma3MLP.__init__   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556r4   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rQ   )rk   rm   ri   rj   )rL   xrk   s      r5   rR   Gemma3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r4   )rm   r`   rk   ri   re   rf   rj   )	r+   r,   r-   r.   r#   rI   rR   r3   r[   r\   s   @r5   r^   r^      s    7/ 7 r4   r^   c                   J   ^  \ rS rSrS	S\S\4U 4S jjjrS rS rS r	Sr
U =r$ )
Gemma3RMSNorm   dimepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g rQ   )rH   rI   rw   rg   	Parameterr0   zerosrT   )rL   rv   rw   rM   s      r5   rI   Gemma3RMSNorm.__init__   s,    ll5;;s#34r4   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )Nr   T)keepdim)r0   rsqrtpowmeanrw   )rL   rq   s     r5   _normGemma3RMSNorm._norm   s4    5;;quuQx}}R}>IJJJr4   c                     U R                  UR                  5       5      nUSU R                  R                  5       -   -  nUR                  U5      $ )NrW   )r   rY   rT   type_as)rL   rq   outputs      r5   rR   Gemma3RMSNorm.forward   sC    AGGI& 3!2!2!445~~a  r4   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tuplerT   shaperw   rL   s    r5   
extra_reprGemma3RMSNorm.extra_repr   s'    ))*+6$((<<r4   )rw   rT   )gư>)r+   r,   r-   r.   rX   rY   rI   r   rR   r   r3   r[   r\   s   @r5   rt   rt      s0    5C 5e 5 5
K!= =r4   rt   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )Gemma3RotaryEmbedding   r`   c                   > [         TU ]  5         [        US5      (       aH  UR                  b;  UR                  R	                  SUR                  R	                  S5      5      U l        OSU l        UR                  U l        UR                  U l        Xl	        [        U R
                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                  U l        g )Nrope_scaling	rope_typetypedefaultinv_freqFrF   )rH   rI   hasattrr   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr`   r   rope_init_fnattention_scalingrJ   r   original_inv_freq)rL   r`   devicer   rM   s       r5   rI   Gemma3RotaryEmbedding.__init__   s    6>**v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r4   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r}   r!   mpscpuF)device_typeenabledr   rv   rU   )r   rY   expandr   rS   r   
isinstancer   strr0   autocast	transposecatcosr   sinrU   )
rL   rq   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r5   rR   Gemma3RotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   r`   r   r   r   r   r   rQ   )r+   r,   r-   r.   r#   rI   r0   no_gradr   rR   r3   r[   r\   s   @r5   r   r      s7    // / /" ]]_<  <r4   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr}   r   r   )r   r0   r   )rq   x1x2s      r5   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r4   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r5   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr4   r<   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r!   N)r   r   reshape)r<   r   batchnum_key_value_headsslenhead_dims         r5   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr4   modulequerykeyvalueattention_maskdropoutscalingsoftcapc                    Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb"  US S 2S S 2S S 2S U	R                  S   24   nX-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R!                  5       nX4$ )	N      r   r
   r}   )rv   rU   )ptrainingr!   )r   r   num_key_value_groupsr0   matmulr   tanhr   rg   
functionalsoftmaxfloat32rS   rU   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                 r5   eager_attention_forwardr     s/    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!$Q1.D
0@0@0D.D%DE#1 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r4   c                   (  ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\R                  S\
\R                     S	\
\   S
\
\R                     S\\   S\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )Gemma3Attentioni*  z=Multi-headed attention from 'Attention Is All You Need' paperr`   	layer_idxc                   > [         TU ]  5         [        US-   UR                  -  5      U l        Xl        X l        [        USUR                  UR                  -  5      U l
        UR                  UR                  -  U l        UR                  S-  U l        U R
                  R                  U l        SU l        ["        R$                  " UR                  UR                  U R                  -  UR&                  S9U l        ["        R$                  " UR                  UR                  U R                  -  UR&                  S9U l        ["        R$                  " UR                  UR                  U R                  -  UR&                  S9U l        ["        R$                  " UR                  U R                  -  UR                  UR&                  S9U l        U R
                  R0                  U l        U R                  (       a  UR2                  OS U l        [5        UR                  UR6                  S9U l        [5        UR                  UR6                  S9U l        g )Nr!   r   r   Trc   )rv   rw   )rH   rI   boolsliding_window_pattern
is_slidingr`   r   getattrre   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropout	is_causalrg   rh   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowrt   rms_norm_epsq_normk_normrL   r`   r   rM   s      r5   rI   Gemma3Attention.__init__-  s   	A1N1NNO"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;f33D#V=P=PQ#V=P=PQr4   r<   position_embeddingsr   past_key_valuecache_positionr   r   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nU R                  U	5      n	U R                  U
5      n
Uu  p[        XX5      u  pUb}  UUUU R                  S.nUR                  XU R                  U5      u  pUbJ  U R                  R                  S:X  a0  UR                   S   nU
S S 2S S 2S U2S S 24   US S 2S S 2S U2S S 24   p[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR!                  SS	5      (       a  ["        R%                  S
5        O[&        U R                  R                     nUb  UR)                  U	5      nU" U U	U
UU4U R*                  (       a  U R,                  OSU R.                  U R                  S.UD6u  nnUR0                  " / UQSP76 R3                  5       nU R5                  U5      nUU4$ )Nr}   r!   r   )r   r   r  r   flash_attention_2eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   )r   r   r   viewr   r   r   r   r   r   r   updater   r`   _attn_implementationr   r   loggerwarning_oncer   rS   r   r   r   r   r   r   )rL   r<   r   r   r   r  r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsseq_lenattention_interfacer   r   s                      r5   rR   Gemma3Attention.forwardJ  s|    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ % "0"&"5"5	L (6'<'<ZW[WeWegs't$J )dkk.N.NRe.e(..r2+5aHWHa6G+H,WXZ[]e^e]eghWhJiL(?;;++w6{{//69fjjI\^c>d>d##L '>dkk>^>^&_#%+..|<N$7
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((r4   )r   r   r`   r   r   r   r   r   r   r   r   r   r   r   r   r   )NN)r+   r,   r-   r.   r/   r#   rX   rI   r0   rZ   r   r   
LongTensorr   r   r   rR   r3   r[   r\   s   @r5   r   r   *  s    GR/ RC RD +/59@)||@) #\\@) !.	@)
 !@) !!1!12@) -.@) 
u||Xell3XeELL>Q5RR	S@) @)r4   r   c                   |  ^  \ rS rSrS\S\4U 4S jjr\" SSS9      SS\R                  S	\R                  S
\R                  S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\R                  \
\\R                  \R                  4      4   4S jj5       rSrU =r$ )Gemma3DecoderLayeri  r`   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        [        XS9U l        [        U5      U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        U R                  R                  U l        UR                   U l        g )N)r`   r   rw   )rH   rI   r`   re   r   r   	self_attnr^   mlprt   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r   r   s      r5   rI   Gemma3DecoderLayer.__init__  s    !--"(LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'..33$33r4   last_cache_positionz4.53.0)versionr<   position_embeddings_globalposition_embeddings_localr   r   r   r  	use_cacher  r   c
                    U R                   (       Ga8  UGb4  [        U	R                  S   U R                  5      nU R                  R
                  S:X  a  US S 2U* S 24   nO[        R                  " UR                  5      R                  n[        R                  " [        R                  " U[        R                  S9U R                  * S9n[        R                  " XU5      nU	S   U-
  S-   n[        R                  " USS9n[        R                  " [        XR                  S   5      UR                   S9nX-  nUS S 2S S 2S S 2U4   nUnU R#                  U5      nU R$                  R                   (       a  UnOUnU R$                  " S
UUUUUUUU	S	.U
D6u  nnU R'                  U5      nUU-   nUnU R)                  U5      nU R+                  U5      nU R-                  U5      nUU-   nU4nU(       a  UU4-  nU$ )Nr   r  r   diagonalr}   r!   )minr   )r<   r   r   r   r   r  r$  r  r*   )r   maxr   r   r`   r
  r0   finforU   r(  tril	ones_liker   whereclamparanger   r  r  r  r  r  r  )rL   r<   r"  r#  r   r   r   r  r$  r  r   effective_seq_len	min_dtypesliding_window_maskoffsetmask_indexesresidualr   self_attn_weightsoutputss                       r5   rR   Gemma3DecoderLayer.forward  s    ???~9 #N$8$8$;T=P=P Q {{//3FF!/4E3E3F0F!G "KK(<(<=AA	&+jjOON%**EQUQdQdPd'# "'-@^!\'+.??!CV3  %||)+?+?+CD^MbMb  &!/1a0E!F ,,]; >>$$";"<+/>> 
,
' 3)%)/)
,
 
,
(( 55mD =0 66}E/77F =0 ")++Gr4   )r`   re   r  r   r   r  r  r  r  r  r   )NNNFFN)r+   r,   r-   r.   r#   rX   rI   r   r0   rZ   r   r  r   r   r   r1   rR   r3   r[   r\   s   @r5   r  r    s   4/ 4C 4 *H= 2637*.,1$)59K||K %*LLK $)<<	K
 !.K u//0K !K $D>K D>K !!1!12K 
u  (51B1BEDUDU1U+V"WW	XK >Kr4   r  c                   P    \ rS rSr\rSrSr/ SQrS/r	Sr
SrSrSrSrSrSrS rSrg)	Gemma3PreTrainedModeli   T)r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr;   c                 ,   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g [        U[        5      (       a%  UR                   R                  R                  5         g g )Nr  )r   stdrW   )r`   initializer_ranger   rg   rh   Conv2drT   datanormal_rd   zero_	EmbeddingrC   rt   fill_Gemma3MultiModalProjectormm_input_projection_weight)rL   r   rA  s      r5   _init_weights#Gemma3PreTrainedModel._init_weights  s   kk++fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> ...MM$$S) 9::--2288: ;r4   r*   N)r+   r,   r-   r.   r"   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendrK  r3   r*   r4   r5   r;  r;    sT    L&*# $5"5!N  $!"&;r4   r;  c                   :  ^  \ rS rSr\rS\4U 4S jjrS rS r\	\
         SS\\R                     S\\R                     S\\R                     S	\\   S
\\R                      S\\   S\\   S\\   S\\R                     S\\   S\4S jj5       5       r\R,                  " 5        SS\\R                  S4   S\R                  S\R                  S	\S\4
S jj5       r\S\R                  S\S\S\R6                  S\R                  S\4S j5       rSrU =r$ )Gemma3TextModeli  r`   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  UR                  U R                  U R                  R                  S-  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        [%        US9U l        SU l        [*        R,                  " U5      nUR.                  Ul        SS0Ul        [%        US9U l        U R7                  5         g s  snf )N      ?)rD   r  r`   Fr   r   )rH   rI   pad_token_idrC   
vocab_sizer?   re   r`   embed_tokensrg   
ModuleListrangenum_hidden_layersr  layersrt   r   normr   
rotary_embgradient_checkpointingcopydeepcopyrope_local_base_freq
rope_thetar   rotary_emb_local	post_initr   s      r5   rI   Gemma3TextModel.__init__  s    !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmDI&JbJbDcdDcy2Dcd
 "&"4"4&:M:MN	/v>&+# v&"77*I6 5V D 	 es    Ec                     U R                   $ rQ   r`  r   s    r5   get_input_embeddings$Gemma3TextModel.get_input_embeddings.  s       r4   c                     Xl         g rQ   rp  rL   r   s     r5   set_input_embeddings$Gemma3TextModel.set_input_embeddings1  s    !r4   rO   r   r   r;   inputs_embedsr$  r  output_hidden_statesr  flash_attn_kwargsr   c
                     Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       aC  Uc@  U R                  (       d/  UR                  u  pn[        U R                   UUUR                  S9nU	cD  Ub  UR                  5       OSn[        R                  " UXR                  S   -   UR                   S9n	Uc  U	R#                  S5      nU R%                  UUU	UU5      nUnU R'                  UU5      nU R)                  UU5      nU(       a  SOS nU(       a  SOS nU R*                  S U R                   R,                    H  nU(       a  UU4-  nU R
                  (       a@  U R                  (       a/  U R/                  [1        UR2                  40 U
D6UUUUUUUUU	5
      nOU" U4UUUUUUUU	S	.U
D6nUS   nU(       d  M  UUS   4-  nM     U R5                  U5      nU(       a  UU4-  n[7        UUUUS
9$ )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)max_batch_sizemax_cache_lenrU   r   r!   r)  r*   )r"  r#  r   r   r   r  r$  r  )last_hidden_stater;   r<   r=   )r`   r  rx  r$  
ValueErrorrg  r   r  r  r`  r   r   rU   get_seq_lengthr0   r0  r   r   _update_causal_maskrf  rl  rd  rc  _gradient_checkpointing_funcr   __call__re  r   )rL   rO   r   r   r;   rw  r$  r  rx  r  ry  
batch_sizer  _past_seen_tokensr   r<   r"  r#  all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                          r5   rR   Gemma3TextModel.forward4  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0%2%8%8"J))%#))	O !CRC^==?de"\\  #6#6q#99$++N )33A6L..
 & &*__]L%Q"$($9$9-$V! #7BD0d![[)H4;;+H+HIM#!m%55!**t}} $ A AM22H6GH!.- #%"! !.!!/I.G#.!-#2&7'#1! (! *!,M  =#3"55E JH 		-0-!11&+++%	
 	
r4   r$   input_tensorc           
         U R                   R                  S:X  a  U$ U R                   R                  S:X  a,  [        U[        R                  5      (       a  [        U5      nU$ UR                  UR                  pvUR                  S   n[        U[        [        45      (       a  UR                  5       n	O!Ub  UR                  S   OUR                  S   n	U R                  UUU	UUUUR                  S   S9n
U
$ )Nr  flex_attentionr!   r}   r   sequence_lengthtarget_lengthrU   r   r  r  )r`   r
  r   r0   rZ   r%   rU   r   r   r   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_position)rL   r   r  r  r;   r  rU   r   r  r  r   s              r5   r  #Gemma3TextModel._update_causal_mask  s     ;;++/BB!!;;++/??.%,,77!<^!L!!$**L,?,?v&,,Q/o['ABB+??AM8F8RN004XdXjXjklXmM PP+')#))!, Q 
 r4   r  r  rU   r  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ 	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   
fill_valuerU   r   r!   r&  r)  r}   r   rv   r0   r+  r(  fullr   triur0  r   r   cloner   rS   masked_fillr   r  r  rU   r  r  r   r   r2  mask_lengthpadding_masks              r5   r  EGemma3TextModel._prepare_4d_causal_attention_mask_with_cache_position  }   < %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r4   )r`  rg  rd  re  rC   rf  rl  r_  )	NNNNNNNNNF)r+   r,   r-   r.   r#   rM  rI   rq  ru  r   r   r   r0   r  rZ   r   r1   r   r   r   r   rR   r   r	   r  staticmethodrX   rU   r  r3   r[   r\   s   @r5   rZ  rZ    s   #L/ 4!"  1515371559$(,0/359t
E,,-t
 !.t
 u//0	t

 "+.t
   1 12t
 D>t
 $D>t
 'tnt
 !!1!12t
 $$89t
 
!t
  t
l ]]_ #($ellK78$ ll$ 	$
 %$  $ $L 444 4 {{	4
 4 4 4r4   rZ  c                     ^  \ rS rSrS/rSS0rSS/S/40r\rSr	S\4U 4S	 jjr
S
 rS rS rS rS rS r\\           SS\\R*                     S\\R,                     S\\R*                     S\\   S\\R0                     S\\R*                     S\\   S\\   S\\   S\\R*                     S\\\R,                  4   S\4S jj5       5       r       S U 4S jjrSrU =r $ )!Gemma3ForCausalLMi  lm_head.weightlm_headcolwise_repr<   r:   language_modelr`   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g rb   )
rH   rI   rZ  modelr_  rg   rh   re   r  rm  rn   s     r5   rI   Gemma3ForCausalLM.__init__  sU     $V,
 ++yy!3!3V5F5FUS 	r4   c                 .    U R                   R                  $ rQ   r  r`  r   s    r5   rq  &Gemma3ForCausalLM.get_input_embeddings  s    zz&&&r4   c                 $    XR                   l        g rQ   r  rt  s     r5   ru  &Gemma3ForCausalLM.set_input_embeddings  s    "'

r4   c                     U R                   $ rQ   r  r   s    r5   get_output_embeddings'Gemma3ForCausalLM.get_output_embeddings"      ||r4   c                     Xl         g rQ   r  rL   new_embeddingss     r5   set_output_embeddings'Gemma3ForCausalLM.set_output_embeddings%      %r4   c                     Xl         g rQ   r  )rL   decoders     r5   set_decoderGemma3ForCausalLM.set_decoder(  s    
r4   c                     U R                   $ rQ   r  r   s    r5   get_decoderGemma3ForCausalLM.get_decoder+  s    zzr4   rO   r   r   r;   rw  labelsr$  r  rx  r  logits_to_keepr   c                 F   U R                   (       aG  U R                  R                  S:w  a-  [        R	                  SU R                  R                   S35        Ub  UOU R                  R
                  nU	b  U	OU R                  R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bH  UU R                  R                  -  n[        R                  " U5      nUU R                  R                  -  nSnUb  U R                   " UX`R"                  40 UD6n[%        UUUR&                  UR(                  UR*                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Gemma3ForCausalLM

>>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```r  zhIt is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	rO   r   r   r;   rw  r$  r  rx  r  )r9   r:   r;   r<   r=   r*   )r   r`   r
  r  r  r  rx  r  r~  r   rX   slicer  final_logit_softcappingr0   r   loss_functionr_  r   r;   r<   r=   )rL   rO   r   r   r;   rw  r  r$  r  rx  r  r  loss_kwargsr8  r<   slice_indicesr:   r9   s                     r5   rR   Gemma3ForCausalLM.forward.  s   P ==T[[==H#{{??@  Aqr 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%%ffooUUD%#33!//))
 	
r4   c	                   > [         TU ]  " U4UUUUUUUS.U	D6n
Uc  U
R                  SS 5      n[        U[        5      (       a  UR
                  S:X  a  U R                  R                  S:X  d  U
S   b"  U
S   R                  u  pnU
S   R                  nO U
S   R                  u  pU
S   R                  nU R                  R                  UUUR                  5       U R                  R                  R                  UUUS9nX:S'   U
$ )	N)r;   r   rw  r  r   r$  r  r  r   r  rw  rO   r  r   )rH   prepare_inputs_for_generationpopr   r   ndimr`   r
  r   r   r  r  r  r  rT   rU   )rL   rO   r;   r   rw  r  r   r$  r  r   model_inputsr  r  r  r   rM   s                  r5   r  /Gemma3ForCausalLM.prepare_inputs_for_generation  s2    w<

+)')%)

 

 !  !148A 44##q(KK448KKO,81=o1N1T1T.
Q%o6==.:;.G.M.M+
%k299!ZZ]] /-AACll))//-% ^ N .<)*r4   )r  r  r_  )NNNNNNNNNNr   )NNNNNTN)!r+   r,   r-   r.   _tied_weights_keys_tp_plan_pp_planr#   rM  rN  rI   rq  ru  r  r  r  r  r   r   r   r0   r  rZ   r   r1   r   r	   rX   r   rR   r  r3   r[   r\   s   @r5   r  r    s   *+=)H_-z:;H#L(/ '(&  1515371559-1$(,0/35934P
E,,-P
 !.P
 u//0	P

 "+.P
   1 12P
 ))*P
 D>P
 $D>P
 'tnP
 !!1!12P
 c5<</0P
 
 P
  P
j 4 4r4   r  c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )rI  i  r`   c                   > [         TU ]  5         [        R                  " [        R
                  " UR                  R                  UR                  R                  5      5      U l	        [        UR                  R                  UR                  R                  S9U l        [        UR                  R                  UR                  R                  -  5      U l        [        UR"                  S-  5      U l        U R                   U R$                  -  U l        [        R(                  " U R&                  U R&                  S9U l        g )Nr  r\  )kernel_sizestride)rH   rI   rg   ry   r0   rz   vision_configre   text_configrJ  rt   layer_norm_epsmm_soft_emb_normrX   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider  	AvgPool2davg_poolrn   s     r5   rI   "Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r4   vision_outputsc                    UR                   u  p#nUR                  SS5      nUR                  X$U R                  U R                  5      nUR	                  5       nU R                  U5      nUR                  S5      nUR                  SS5      nU R                  U5      n[        R                  " XpR                  5      nUR                  U5      $ )Nr!   r   )r   r   r   r  r   r  flattenr  r0   r   rJ  r   )	rL   r  r  r  
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r5   rR   !Gemma3MultiModalProjector.forward  s    $2$8$8!
z"0":":1a"@"9"A"AD$:$:D<R<R#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EGfGf#g '//??r4   )r  r  rJ  r  r  r  )r+   r,   r-   r.   r"   rI   r0   rZ   rR   r3   r[   r\   s   @r5   rI  rI    s)    \| \ @ell @ @r4   rI  zx
    The Base Gemma3 model which consists of a vision backbone and a language model withou language modeling head.,
    )custom_introc            !         ^  \ rS rSrSS0rS\4U 4S jjrS rS r SS\	4S	 jjr
S
\R                  S\R                  4S jr\\             SS\R                   S
\R"                  S\\R                     S\\R                      S\\\\R"                     \4      S\\R                      S\\R                      S\\R"                     S\\R                      S\\	   S\\	   S\\	   S\\	   S\\\4   4S jj5       5       rSrU =r$ )Gemma3Modeli  zlanguage_model.modelr  r`   c                   > [         TU ]  U5        [        R                  " UR                  S9U l        [        U5      U l        UR                  R                  U l	        [        R                  " UR                  S9nX l
        U R                  R                  b  U R                  R                  OSU l        U R                  5         g )Nr]  r}   )rH   rI   r    from_configr  vision_towerrI  multi_modal_projectorr  r_  r  r`   r^  rm  )rL   r`   r  rM   s      r5   rI   Gemma3Model.__init__  s     %119M9MN%>v%F" ,,77"..f6H6HI,8<8P8P8\DKK44bdr4   c                 6    U R                   R                  5       $ rQ   )r  rq  r   s    r5   rq   Gemma3Model.get_input_embeddings  s    ""7799r4   c                 :    U R                   R                  U5        g rQ   )r  ru  rt  s     r5   ru   Gemma3Model.set_input_embeddings  s    007r4   is_trainingc                 F   U R                   R                  R                  S:X  a  U$ Ub  UR                  5       S:X  a  U$ [	        U[
        5      n[        R                  " U R                  5      R                  nUR                  S S u  pU(       a  UR                  5       nO_[	        U[        5      (       a  UR                  5       nO9[	        U[        R                  5      (       a  UR                  S   O
US   U
-   S-   nUb  UR                  5       S:X  a  U$ [        R                  " X4XR                  UR                  S9nU
S:w  a  [        R                   " USS9nU[        R"                  " XR                  S	9UR%                  SS5      :  -  nUS S S S 2S S 24   R'                  U	SSS5      nUGbZ  U
S:w  GaS  UR)                  S5      UR)                  S5      :H  nS
XS:H  '   US:H  nU[*        R,                  R/                  USSS9S S 2S S24   ) -  n[        R0                  " UR3                  5       SS9S-
  n[        R4                  " UU[        R6                  " US5      5      nUR)                  S5      UR)                  S5      :H  nS
UUS:H  '   UU-  R)                  S5      R9                  UR                  [        R:                  S9nUR=                  5       nUS S 2S S 2S S 2S U
24   R?                  US5      US S 2S S 2S S 2S U
24'   Ub  UR=                  5       nUR                  S   nUS S 2S S 2S S 2S U24   US S 2S S S S 24   R9                  UR                  5      -   nUS:H  nUS S 2S S 2S S 2S U24   R?                  UU5      US S 2S S 2S S 2S U24'   U$ )Nr  r  r   r}   r   r!   r  r&  r)  F)r!   r   )r   r   r   r  ) r`   r  r
  rv   r   r   r0   r+  rU   r(  r   r  r   rZ   r  r   r  r0  r   r   r   rg   r   padcumsumrX   r.  	full_likerS   r   r  r  )rL   r   token_type_idsr;   r  r  r  using_static_cacher2  inputs_lead_dimr  r  r   token_type_maskis_imagenew_image_startimage_group_idssame_image_mask
image_maskr  r  s                        r5   r  Gemma3Model._update_causal_mask  s    ;;""77;NN!!%.*<*<*>!*C "!'EKK

+//	+7+=+=bq+A(+??AM55+??AM nell;; $$R(#A&81<  %.*<*<*>!*C!!jj,**]k]r]r

 a**[1=Ku||M:O:OPSaSiSijlnoSppp!$a"23::?ArSUV %/Q*>,66q9^=U=UVW=XXO38Oa/0 &*H&"--*;*;HfTU*;*VWXZ][]Z]W]*^)^^O#ll?+>+>+@aH1LO#kk(OU__UcegEhiO-77:o>W>WXY>ZZO5:OOr12)O;FFqILL[M_M_glgqgqLrJ%++-K5@AqJZ?JZAZ5[5g5gC6K1a!1/!112 %%++-K(..r2K 'q!Q'<=qRVX\^_O_@`@c@cdodvdv@wwL'1,L1<Q1l{l=R1S1_1_i2K1a+-. r4   pixel_valuesr   c                 Z    U R                  US9R                  nU R                  U5      nU$ )a]  
Projects the last hidden state from the vision model into language model space.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
)r  )r  r~  r  )rL   r  r  image_featuress       r5   get_image_featuresGemma3Model.get_image_featuresG  s3     ***EWW33NCr4   rO   r   r   r;   r  r  rw  r  r$  r  rx  return_dictc                 D   USL USL-  (       a  [        S5      eUb  UOU R                  R                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUSL=(       a    U	SLnUbR  U R                  R
                  U R                  :  a.  XR                  R
                  :H  nUR                  5       nSUU'   OUnUc  U R                  5       " U5      nUcE  Ub  UR                  5       OSn[        R                  " UUUR                  S   -   UR                  S9nUGbx  U R                  U5      nUcY  XR                  5       " [        R                  " U R                  R
                  [        R                   UR                  S95      :H  nOQXR                  R
                  :H  R#                  S5      nUR%                  U5      R'                  UR                  5      n[)        5       (       ds  UU   R+                  5       UR+                  5       :w  aN  UR-                  SS9R-                  SS9S   n[        S	U S
UR                  S   UR                  S   -   S35      eUR'                  UR                  UR.                  5      nUR1                  UU5      nU R3                  X6XWX5      nU R4                  " SUUUUU
UUSUS.	UD6n[7        UR8                  U
(       a  UR:                  OSUR<                  UR>                  Ub  WS9$ SS9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```Nr{  r   r!   r)  )rU   r   r}   r   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.T)	r   r   r;   rw  r$  r  rx  r  r  )r~  r;   r<   r=   r)   r*   ) r  r`   r  rx  use_return_dictimage_token_idr_  r  rq  r  r0   r0  r   r   r  rK   longr   	expand_asrS   r   numelsumrU   masked_scatterr  r  r'   r~  r;   r<   r=   )rL   rO   r  r   r   r;   r  r  rw  r  r$  r  rx  r  	lm_kwargsr  special_image_maskllm_input_idsr  r  image_tokens_in_textr   r8  s                          r5   rR   Gemma3Model.forwardU  sO   \ -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$D0GV45G  T[[%?%?4??%R!*kk.H.H!H%OO-M01M,-%M  557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\BN %26O6O6QLL!;!;5::VcVjVjk7 &" '0;;3M3M&M%X%XY[%\"%7%A%A-%P%S%STaThTh%i"+---@R2S2Y2Y2[_m_s_s_u2u(:'?'?A'?'F'J'Jq'J'QRS'T$ /00N~OcOcdeOfiwi}i}~  jA  PA  OB B44 
 ,..}/C/C]EXEXYN)889K^\M..O]
 %% 
&%+'/!5)
 
 )%777@G33d!//))2>2J
 	

 QU
 	
r4   )r  r  r^  r  r_  r  )NNNNNNNNNNNNN)r+   r,   r-   r.   _checkpoint_conversion_mappingr"   rI   rq  ru  r   r  r0   rZ   r  r   r   r  r1   r   r	   r   r   r   r'   rR   r3   r[   r\   s   @r5   r  r    s    '=>N%O"
| 
:8 "N N`u||    '+*.1537KO595959-1$(,0/3&*t
##t
 ''t
 !.	t

 u//0t
 "%U->->(?(F"GHt
 !!1!12t
 !!1!12t
   1 12t
 ))*t
 D>t
 $D>t
 'tnt
 d^t
  
u//	0!t
  t
r4   r  zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c            "         ^  \ rS rSrSSSSS.rS/rS\4U 4S	 jjrS
 rS r	S r
S r\S 5       r\S 5       r\S 5       r\              S(S\R$                  S\R&                  S\\R*                     S\\R$                     S\\\\R&                     \4      S\\R$                     S\\R$                     S\\R&                     S\\R$                     S\\   S\\   S\\   S\\   S\\\R*                  4   S\\\4   4S  jj5       r          S)U 4S! jjr\S\R*                  S"\S#\S$\R@                  S\R*                  S%\4S& j5       r!S'r"U =r#$ )*Gemma3ForConditionalGenerationi  zmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorr  )z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headr  r`   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g rb   )rH   rI   r  r  rg   rh   r  re   r_  r  rm  rn   s     r5   rI   'Gemma3ForConditionalGeneration.__init__  sS      (
yy!3!3!?!?ASASA^A^ejkr4   c                 6    U R                   R                  5       $ rQ   )r  rq  r   s    r5   rq  3Gemma3ForConditionalGeneration.get_input_embeddings  s    zz..00r4   c                 :    U R                   R                  U5        g rQ   )r  ru  rt  s     r5   ru  3Gemma3ForConditionalGeneration.set_input_embeddings  s    

''.r4   c                     U R                   $ rQ   r  r   s    r5   r  4Gemma3ForConditionalGeneration.get_output_embeddings  r  r4   c                     Xl         g rQ   r  r  s     r5   r  4Gemma3ForConditionalGeneration.set_output_embeddings  r  r4   c                 .    U R                   R                  $ rQ   )r  r  r   s    r5   r  -Gemma3ForConditionalGeneration.language_model  s    zz(((r4   c                 .    U R                   R                  $ rQ   )r  r  r   s    r5   r  +Gemma3ForConditionalGeneration.vision_tower  s    zz&&&r4   c                 .    U R                   R                  $ rQ   )r  r  r   s    r5   r  4Gemma3ForConditionalGeneration.multi_modal_projector  s    zz///r4   rO   r  r   r   r;   r  r  rw  r  r$  r  rx  r  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  " SUUUUUUUU
U	UUUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU	GbQ  UR                  5       nUSSS2SS24   nU	SSS24   nUb  USS2UR                  S   * S24   R                  UR                  5      nUUR                  UR                  5      S:g     R                  5       nUUR                  UR                  5      S:g     R                  5       nO UR                  5       nUR                  5       n[        R                  " 5       nUR!                  SU R                   R"                  R$                  5      nUR!                  S5      R                  UR                  5      nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ ['        UUUR(                  UR*                  UR,                  UR.                  S9$ )	a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenizer=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
N)rO   r  r  r   r   r;   rw  r$  r  r  rx  r  r  r   .r}   r!   )r9   r:   r;   r<   r=   r)   r*   )r`   r  rx  r  r  r   rX   r  r  rY   r   rS   r   r   rg   CrossEntropyLossr  r  r_  r7   r;   r<   r=   r)   )rL   rO   r  r   r   r;   r  r  rw  r  r$  r  rx  r  r  r  r8  r<   r  r:   r9   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsr   s                               r5   rR   &Gemma3ForConditionalGeneration.forward  s~   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
%))%+'/!5#)
 
"  
8B>SV8W8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5DY,F'+'7D7V#CVC+#33!//)) ' ; ;
 	
r4   c                   > [         TU ]  " U4UUUUUU	U
US.UD6nUS   S:X  a  XmS'   US L=(       a    US LnUS   S:X  a>  [        U[        5      (       a)  Ub  UOUnU R                  R                  XxX$X5      nUUS'   U$ )N)r;   rw  r   r   r  r$  r  r  r   r  r   )rH   r  r   r   r  r  )rL   rO   r;   rw  r  r   r  r   r  r$  r  r  r   r  r  r  r   rM   s                    r5   r  <Gemma3ForConditionalGeneration.prepare_inputs_for_generationz  s      w<
+')%)))
 
 !!+7($D0GV45G!!j+&N&N,9,E=9L**88Q]K .9L)*r4   r  r  rU   r  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ r  r  r  s              r5   r  TGemma3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  r  r4   )r  r  )NNNNNNNNNNNNNr   )
NNNNNNNTNN)$r+   r,   r-   r.   r   r  r"   rI   rq  ru  r  r  propertyr  r  r  r   r0   r  r1   r   rZ   r	   r   r   r   rX   r   r7   rR   r  r  rU   r  r3   r[   r\   s   @r5   r"  r"    ss    "8-"?#,	&" ++| 1/& ) ) ' ' 0 0  '+*.1537KO595959-1$(,0/3&*34|
##|
 ''|
 !.	|

 u//0|
 "%U->->(?(F"GH|
 !!1!12|
 !!1!12|
   1 12|
 ))*|
 D>|
 $D>|
 'tn|
 d^|
 c5<</0|
" 
u22	3#|
 |
B )V 444 4 {{	4
 4 4 4r4   r"  )r;  rZ  r  r"  r  )Nr!   )r  NN)Qrh  collections.abcr   dataclassesr   	functoolsr   typingr   r   r   r	   r0   torch.nnrg   activationsr   cache_utilsr   r   r   
generationr   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   autor    configuration_gemma3r"   r#   !torch.nn.attention.flex_attentionr$   integrations.flex_attentionr%   
get_loggerr+   r  r'   r7   rG  r?   Moduler^   rt   r   r   r   rZ   rX   r   rY   r   r   r  r;  rZ  r  rI  r  r"  __all__r*   r4   r5   <module>rW     s  ,  $ !  / /   ! : : ) B O K F &  1  @  !!;J 
		H	% < 7 < <@ $<; $< $<N
SBLL 
S		  =BII =(<BII <D(6	UU\\ 	U# 	U%,, 	U$ ## %II %<< % 
 % <<	 %
 U\\* %  % e_ % e_ % 5<<%& %F`)bii `)F[ [| !;O !; !;H w+ w wt j- j jZ!@		 !@H 
i
' i

i
X 
G%:O G
GTr4   