
    fTh                     (   S SK r S SKJr  S SKJr  S SKJr  S SKJrJ	r	J
r
JrJrJr  S SKrS SKJr  S SKrSSKJrJrJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4  SSK5J6r6J7r7J8r8J9r9  SSK:J;r;  \&Rx                  " \=5      r> " S S\*5      r? " S S\5      r@\ " S S\95      5       rA\ " S S\65      5       rB " S S\R                  5      rD " S S \.5      rE " S! S"\15      rF " S# S$\25      rG " S% S&\,5      rH " S' S(\R                  5      rJSrK " S) S*\05      rL " S+ S,\/5      rM " S- S.\-5      rN " S/ S0\R                  5      rO " S1 S2\85      rP " S3 S4\75      rQ/ S5QrRg)6    N)Callable)	dataclass)partial)AnyDictListOptionalTupleUnion   )CacheHybridCacheStaticCache)PretrainedConfig)FlashAttentionKwargs)BaseModelOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging)deprecate_kwarg   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaligemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)SiglipVisionConfigc                   D   ^  \ rS rSrSrSr        SU 4S jjrSrU =r$ )Gemma3TextConfig;   a   
This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Gemma3Text-7B.
e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
    vocab_size (`int`, *optional*, defaults to 262208):
        Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`Gemma3TextModel`]
    hidden_size (`int`, *optional*, defaults to 2304):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 9216):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 26):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*, defaults to 4):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details checkout [this
        paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
        `num_attention_heads`.
    head_dim (`int`, *optional*, defaults to 256):
        The attention head dimension.
    hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
        if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
    max_position_embeddings (`int`, *optional*, defaults to 131072):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 1):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 2):
        Beginning of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 1000000.0):
        The base period of the RoPE embeddings.
    attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    query_pre_attn_scalar (`float`, *optional*, defaults to 256):
        Scaling factor used on the attention scores
    sliding_window (`int`, *optional*, defaults to 4096): in Gemma3Text, every other layer uses sliding window attention. This is the
        size of the sliding window.
    final_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the logits.
    attn_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the attention scores.
    cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
        and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    rope_local_base_freq (float, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings for local attention.
    sliding_window_pattern (`int`, *optional*, defaults to 6):
        Pattern for the sliding window attention.

```python
>>> from transformers import Gemma3TextModel, Gemma3TextConfig
>>> # Initializing a Gemma3Text gemma3_text-7b style configuration
>>> configuration = Gemma3TextConfig()
>>> # Initializing a model from the gemma3_text-7b style configuration
>>> model = Gemma3TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
    rope_local_base_freq (float, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings for local attention.
    sliding_window_pattern (`int`, *optional*, defaults to 6):
        Pattern for the sliding window attention.
gemma3_textc	                 b   > [         T
U ]  " U 40 U	D6  X@l        XPl        X0l        [        U 5        g N)super__init__rope_local_base_freqsliding_window_patternrope_scalingr   )self
vocab_size
rope_thetar5   r3   r4   max_position_embeddingsfinal_logit_softcappingattn_logit_softcappingsuper_kwargs	__class__s             a/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/gemma3/modular_gemma3.pyr2   Gemma3TextConfig.__init__   s1     	..$8!&<#(t$    )r3   r5   r4   )i@  g    .ANg     @   i   NN)	__name__
__module____qualname____firstlineno____doc__
model_typer2   __static_attributes____classcell__r=   s   @r>   r,   r,   ;   s5    wr J %  ' $#% %r@   r,   c                      ^  \ rS rSrSrSrSSSS.r\\S.r	       SS	\
\\\\\4   4      S
\
\\\\\4   4      S\S\S\S\S\4U 4S jjjrSrU =r$ )Gemma3Config   a  
This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the PaliGemma-2B.

e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
        The config object of the text backbone.
    vision_config (`Union[AutoConfig, dict]`,  *optional*):
        Custom vision config or dict.
    mm_tokens_per_image (`int`, *optional*, defaults to 256):
        The number of tokens per image embedding.
    boi_token_index (`int`, *optional*, defaults to 255999):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_index (`int`, *optional*, defaults to 256000):
        The end-of-image token index to wrap the image prompt.
    image_token_index (`int`, *optional*, defaults to 262144):
        The image token index to encode the image prompt.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


Example:

```python
>>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

>>> # Initializing a Siglip-like vision config
>>> vision_config = SiglipVisionConfig()

>>> # Initializing a Gemma3 Text config
>>> text_config = Gemma3TextConfig()

>>> # Initializing a Gemma3 gemma-3-4b style configuration
>>> configuration = Gemma3Config(vision_config, text_config)

>>> # Initializing a model from the gemma-3-4b style configuration
>>> model = Gemma3TextConfig(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configrU   rV   mm_tokens_per_imageinitializer_rangec                   > Uc   [        5       n[        R                  S5        O [        U[        5      (       a  [        S0 UD6n[        U[        5      (       a  [        S0 UD6nO"Uc  [        5       n[        R                  S5        Xl        X l        X0l        X@l	        XPl
        X`l        Xpl        [        T	U ]8  " S0 UD6  g )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config. )r,   loggerinfo
isinstancedictr*   rU   rV   rW   rP   rQ   rO   rX   r1   r2   )
r6   rU   rV   rW   rP   rQ   rO   rX   kwargsr=   s
            r>   r2   Gemma3Config.__init__  s     *,KKKZ[T***9[9KmT**.??M".0MKK`a&*#6 ..!2!2"6"r@   )rP   rQ   rO   rX   rW   rU   rV   )NN   i i  i   g{Gz?)rB   rC   rD   rE   rF   rG   attribute_mapr,   r*   sub_configsr	   r   r   strr   intfloatr2   rH   rI   rJ   s   @r>   rL   rL      s    .` J-))M (+K JNMQ#&&&!(#'#e$4d38n$DEF#  &8$sCx.&H IJ# !	#
 # # # !# #r@   rL   c                       \ rS rSrSrg)Gemma3ModelOutputWithPasti*  rZ   NrB   rC   rD   rE   rH   rZ   r@   r>   rh   rh   *      r@   rh   c                       \ rS rSrSrg)Gemma3CausalLMOutputWithPasti/  rZ   Nri   rZ   r@   r>   rl   rl   /  rj   r@   rl   c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3TextScaledWordEmbeddingi4  zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 p   > [         TU ]  XU5        U R                  S[        R                  " U5      SS9  g )Nrr   F)
persistent)r1   r2   register_buffertorchtensor)r6   ro   rp   rq   rr   r=   s        r>   r2   &Gemma3TextScaledWordEmbedding.__init__9  s1    D]ELL,ERWXr@   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ r0   )r1   forwardrr   toweightdtype)r6   ry   r=   s     r>   r{   %Gemma3TextScaledWordEmbedding.forward=  s2    wy)D,<,<,?,?@Q@Q,RRRr@   rZ   )      ?)rB   rC   rD   rE   rF   re   rf   r2   rv   Tensorr{   rH   rI   rJ   s   @r>   rn   rn   4  sM    Ys Y3 YS Y_d Y YS S Sr@   rn   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )	Gemma3MLPiA  configc                 $   > [         TU ]  U5        g r0   r1   r2   r6   r   r=   s     r>   r2   Gemma3MLP.__init__B       r@   rZ   rB   rC   rD   rE   r,   r2   rH   rI   rJ   s   @r>   r   r   A  s    !/ ! !r@   r   c                   8   ^  \ rS rSrSS\S\4U 4S jjjrSrU =r$ )Gemma3RMSNormiF  dimepsc                 "   > [         TU ]  5         g r0   r   )r6   r   r   r=   s      r>   r2   Gemma3RMSNorm.__init__G  s    r@   rZ   )gư>)	rB   rC   rD   rE   re   rf   r2   rH   rI   rJ   s   @r>   r   r   F  s    C e  r@   r   c                   4   ^  \ rS rSrSS\4U 4S jjjrSrU =r$ )Gemma3RotaryEmbeddingiK  r   c                 $   > [         TU ]  U5        g r0   r   )r6   r   devicer=   s      r>   r2   Gemma3RotaryEmbedding.__init__L  r   r@   rZ   r0   r   rJ   s   @r>   r   r   K  s    !/ ! !r@   r   c                   $  ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S\	\R                     S\	\
   S	\	\R                     S
\\   S\\R                  \	\R                     \	\\R                        4   4S jjrSrU =r$ )Gemma3AttentioniQ  r   	layer_idxc                 6  > [        US-   UR                  -  5      U l        [        TU ]  5         U R                  (       a  UR
                  OS U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g )N   )r   r   )boolr4   
is_slidingr1   r2   sliding_windowr   head_dimrms_norm_epsq_normk_normr6   r   r   r=   s      r>   r2   Gemma3Attention.__init__R  so    	A1N1NNO7;f33D#V=P=PQ#V=P=PQr@   hidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionr_   returnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nU R                  U	5      n	U R                  U
5      n
Uu  p[        XX5      u  pUb}  UUUU R                  S.nUR                  XU R                  U5      u  pUbJ  U R                  R                  S:X  a0  UR                   S   nU
S S 2S S 2S U2S S 24   US S 2S S 2S U2S S 24   p[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR!                  SS	5      (       a  ["        R%                  S
5        O[&        U R                  R                     nUb  UR)                  U	5      nU" U U	U
UU4U R*                  (       a  U R,                  OSU R.                  U R                  S.UD6u  nnUR0                  " / UQSP76 R3                  5       nU R5                  U5      nUU4$ )Nr   r   )sincosr   r   flash_attention_2eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )dropoutscalingr   )shaper   q_projview	transposek_projv_projr   r   r$   r   updater   r   _attn_implementationr%   getr[   warning_oncer   r|   trainingattention_dropoutr   reshape
contiguouso_proj)r6   r   r   r   r   r   r_   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsseq_lenattention_interfaceattn_outputattn_weightss                      r>   r{   Gemma3Attention.forward[  s|    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ % "0"&"5"5	L (6'<'<ZW[WeWegs't$J )dkk.N.NRe.e(..r2+5aHWHa6G+H,WXZ[]e^e]eghWhJiL(?;;++w6{{//69fjjI\^c>d>d##L '>dkk>^>^&_#%+..|<N$7
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((r@   )r   r   r   r   )NN)rB   rC   rD   rE   r,   re   r2   rv   r   r	   r   
LongTensorr   r   tupler{   rH   rI   rJ   s   @r>   r   r   Q  s    R/ RC R +/59@)||@) #\\@) !.	@)
 !@) !!1!12@) -.@) 
u||Xell3XeELL>Q5RR	S@) @)r@   r   c                   |  ^  \ rS rSrS\S\4U 4S jjr\" SSS9      SS\R                  S	\R                  S
\R                  S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\R                  \
\\R                  \R                  4      4   4S jj5       rSrU =r$ )Gemma3DecoderLayeri  r   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        [        XS9U l        [        U5      U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        U R                  R                  U l        UR                   U l        g )N)r   r   r   )r1   r2   r   hidden_sizer   r   	self_attnr   mlpr   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r   r   s      r>   r2   Gemma3DecoderLayer.__init__  s    !--"(LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'..33$33r@   last_cache_positionz4.53.0)versionr   position_embeddings_globalposition_embeddings_localr   position_idsr   r   	use_cacher   r   c
                    U R                   (       Ga8  UGb4  [        U	R                  S   U R                  5      nU R                  R
                  S:X  a  US S 2U* S 24   nO[        R                  " UR                  5      R                  n[        R                  " [        R                  " U[        R                  S9U R                  * S9n[        R                  " XU5      nU	S   U-
  S-   n[        R                  " USS9n[        R                  " [        XR                  S   5      UR                   S9nX-  nUS S 2S S 2S S 2U4   nUnU R#                  U5      nU R$                  R                   (       a  UnOUnU R$                  " S
UUUUUUUU	S	.U
D6u  nnU R'                  U5      nUU-   nUnU R)                  U5      nU R+                  U5      nU R-                  U5      nUU-   nU4nU(       a  UU4-  nU$ )Nr   r   r~   diagonalr   r   )minr   )r   r   r   r   r   r   r   r   rZ   )r   maxr   r   r   r   rv   finfor~   r   tril	ones_liker   whereclamparanger   r   r   r   r   r   r   )r6   r   r   r   r   r   r   r   r   r   r_   effective_seq_len	min_dtypesliding_window_maskoffsetmask_indexesresidualr   self_attn_weightsoutputss                       r>   r{   Gemma3DecoderLayer.forward  s    ???~9 #N$8$8$;T=P=P Q {{//3FF!/4E3E3F0F!G "KK(<(<=AA	&+jjOON%**EQUQdQdPd'# "'-@^!\'+.??!CV3  %||)+?+?+CD^MbMb  &!/1a0E!F ,,]; >>$$";"<+/>> 
,
' 3)%)/)
,
 
,
(( 55mD =0 66}E/77F =0 ")++Gr@   )r   r   r   r   r   r   r   r   r   r   r   )NNNFFN)rB   rC   rD   rE   r,   re   r2   r   rv   r   r	   r   r   r   r   FloatTensorr{   rH   rI   rJ   s   @r>   r   r     s   4/ 4C 4 *H= 2637*.,1$)59K||K %*LLK $)<<	K
 !.K u//0K !K $D>K D>K !!1!12K 
u  (51B1BEDUDU1U+V"WW	XK >Kr@   r   c                   &    \ rS rSrSr/ SQrS rSrg)Gemma3PreTrainedModeli   )r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                 ,   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g [        U[        5      (       a%  UR                   R                  R                  5         g g )Nr   )meanstdr   )r   rX   r]   nnLinearConv2dr}   datanormal_biaszero_	Embeddingrq   r   fill_Gemma3MultiModalProjectormm_input_projection_weight)r6   moduler  s      r>   _init_weights#Gemma3PreTrainedModel._init_weights  s   kk++fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> ...MM$$S) 9::--2288: ;r@   rZ   N)rB   rC   rD   rE   base_model_prefix_no_split_modulesr  rH   rZ   r@   r>   r   r     s    ;r@   r   c                      ^  \ rS rSr\rS\4U 4S jjr         SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S	\\   S
\\   S\\   S\\R                     S\\   S\4S jjrSrU =r$ )Gemma3TextModeli  r   c                 ,  > [         TU ]  U5        [        UR                  UR                  U R
                  U R                  R                  S-  S9U l        [        R                  " U5      nUR                  Ul        SS0Ul        [        US9U l        g )N      ?)rr   	rope_typedefault)r   )r1   r2   rn   r7   r   rq   r   embed_tokenscopydeepcopyr3   r8   r5   r   rotary_emb_localr   s     r>   r2   Gemma3TextModel.__init__  s      :v1143C3CQUQ\Q\QhQhjmQm
 v&"77*I6 5V Dr@   ry   r   r   past_key_valuesinputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsr   c
                     Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       aC  Uc@  U R                  (       d/  UR                  u  pn[        U R                   UUUR                  S9nU	cD  Ub  UR                  5       OSn[        R                  " UXR                  S   -   UR                   S9n	Uc  U	R#                  S5      nU R%                  UUU	UU5      nUnU R'                  UU5      nU R)                  UU5      nU(       a  SOS nU(       a  SOS nU R*                  S U R                   R,                    H  nU(       a  UU4-  nU R
                  (       a@  U R                  (       a/  U R/                  [1        UR2                  40 U
D6UUUUUUUUU	5
      nOU" U4UUUUUUUU	S	.U
D6nUS   nU(       d  M  UUS   4-  nM     U R5                  U5      nU(       a  UU4-  n[7        UUUUS
9$ )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)max_batch_sizemax_cache_lenr~   r   r   r   rZ   )r   r   r   r   r   r   r   r   )last_hidden_stater  r   
attentions)r   r   r   r   
ValueErrorgradient_checkpointingr   r[   r   r  r   r   r~   get_seq_lengthrv   r   r   	unsqueeze_update_causal_mask
rotary_embr  layersnum_hidden_layers_gradient_checkpointing_funcr   __call__normr   )r6   ry   r   r   r  r  r   r   r   r   r!  
batch_sizer   _past_seen_tokenscausal_maskr   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                          r>   r{   Gemma3TextModel.forward+  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0%2%8%8"J))%#))	O !CRC^==?de"\\  #6#6q#99$++N )33A6L..
 & &*__]L%Q"$($9$9-$V! #7BD0d![[)H4;;+H+HIM#!m%55!**t}} $ A AM22H6GH!.- #%"! !.!!/I.G#.!-#2&7'#1! (! *!,M  =#3"55E JH 		-0-!11&+++%	
 	
r@   )r  r  )	NNNNNNNNN)rB   rC   rD   rE   r,   config_classr2   r	   rv   r   r   r   r   r   r   r   r   r{   rH   rI   rJ   s   @r>   r  r    s   #LE/ E" 1515371559$(,0/359t
E,,-t
 !.t
 u//0	t

 "+.t
   1 12t
 D>t
 $D>t
 'tnt
 !!1!12t
 $$89t
 
!t
 t
r@   r  c                   8   ^  \ rS rSr\rSrS\4U 4S jjrSrU =r	$ )Gemma3ForCausalLMi  language_modelr   c                 D   > [         TU ]  U5        [        U5      U l        g r0   )r1   r2   r  modelr   s     r>   r2   Gemma3ForCausalLM.__init__  s     $V,
r@   )rA  )
rB   rC   rD   rE   r,   r<  r  r2   rH   rI   rJ   s   @r>   r>  r>    s     #L(-/ - -r@   r>  c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )r  i  r   c                   > [         TU ]  5         [        R                  " [        R
                  " UR                  R                  UR                  R                  5      5      U l	        [        UR                  R                  UR                  R                  S9U l        [        UR                  R                  UR                  R                  -  5      U l        [        UR"                  S-  5      U l        U R                   U R$                  -  U l        [        R(                  " U R&                  U R&                  S9U l        g )Nr   r  )kernel_sizestride)r1   r2   r  	Parameterrv   zerosrV   r   rU   r  r   layer_norm_epsmm_soft_emb_normre   
image_size
patch_sizepatches_per_imagerW   tokens_per_siderE  	AvgPool2davg_poolr   s     r>   r2   "Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r@   vision_outputsc                    UR                   u  p#nUR                  SS5      nUR                  X$U R                  U R                  5      nUR	                  5       nU R                  U5      nUR                  S5      nUR                  SS5      nU R                  U5      n[        R                  " XpR                  5      nUR                  U5      $ )Nr   r   )r   r   r   rM  r   rP  flattenrJ  rv   matmulr  type_as)	r6   rR  r3  r4  
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r>   r{   !Gemma3MultiModalProjector.forward  s    $2$8$8!
z"0":":1a"@"9"A"AD$:$:D<R<R#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EGfGf#g '//??r@   )rP  rE  r  rJ  rM  rN  )rB   rC   rD   rE   rL   r2   rv   r   r{   rH   rI   rJ   s   @r>   r  r    s)    \| \ @ell @ @r@   r  c            !          \ rS rSrS\R
                  S\R
                  4S jr SS\4S jjr\	\
             SS\R                  S\R                  S	\\R
                     S
\\R                     S\\\\R                     \4      S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       5       rSrg)Gemma3Modeli  pixel_valuesr   c                 Z    U R                  US9R                  nU R                  U5      nU$ )a]  
Projects the last hidden state from the vision model into language model space.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
)r_  )vision_towerr&  multi_modal_projector)r6   r_  rR  image_featuress       r>   get_image_featuresGemma3Model.get_image_features  s3     ***EWW33NCr@   is_trainingc                 F   U R                   R                  R                  S:X  a  U$ Ub  UR                  5       S:X  a  U$ [	        U[
        5      n[        R                  " U R                  5      R                  nUR                  S S u  pU(       a  UR                  5       nO_[	        U[        5      (       a  UR                  5       nO9[	        U[        R                  5      (       a  UR                  S   O
US   U
-   S-   nUb  UR                  5       S:X  a  U$ [        R                  " X4XR                  UR                  S9nU
S:w  a  [        R                   " USS9nU[        R"                  " XR                  S	9UR%                  SS5      :  -  nUS S S S 2S S 24   R'                  U	SSS5      nUGbZ  U
S:w  GaS  UR)                  S5      UR)                  S5      :H  nS
XS:H  '   US:H  nU[*        R,                  R/                  USSS9S S 2S S24   ) -  n[        R0                  " UR3                  5       SS9S-
  n[        R4                  " UU[        R6                  " US5      5      nUR)                  S5      UR)                  S5      :H  nS
UUS:H  '   UU-  R)                  S5      R9                  UR                  [        R:                  S9nUR=                  5       nUS S 2S S 2S S 2S U
24   R?                  US5      US S 2S S 2S S 2S U
24'   Ub  UR=                  5       nUR                  S   nUS S 2S S 2S S 2S U24   US S 2S S S S 24   R9                  UR                  5      -   nUS:H  nUS S 2S S 2S S 2S U24   R?                  UU5      US S 2S S 2S S 2S U24'   U$ )Nr      r   r   r   r   )
fill_valuer~   r   r   r   F)r   r   )valuer   r   r   ) r   rU   r   r   r]   r   rv   r   r~   r   r   get_max_cache_shaper   r   fullr   triur   r   expandr+  r  
functionalpadcumsumre   r   	full_liker|   r   clonemasked_fill)r6   r   token_type_idsr  r   input_tensorrf  using_static_cacher   inputs_lead_dimsequence_lengthtarget_lengthr6  token_type_maskis_imagenew_image_startimage_group_idssame_image_mask
image_maskmask_lengthpadding_masks                        r>   r,  Gemma3Model._update_causal_mask  s    ;;""77;NN!!%.*<*<*>!*C "!'EKK

+//	+7+=+=bq+A(+??AM55+??AM nell;; $$R(#A&81<  %.*<*<*>!*C!!jj,**]k]r]r

 a**[1=Ku||M:O:OPSaSiSijlnoSppp!$a"23::?ArSUV %/Q*>,66q9^=U=UVW=XXO38Oa/0 &*H&"--*;*;HfTU*;*VWXZ][]Z]W]*^)^^O#ll?+>+>+@aH1LO#kk(OU__UcegEhiO-77:o>W>WXY>ZZO5:OOr12)O;FFqILL[M_M_glgqgqLrJ%++-K5@AqJZ?JZAZ5[5g5gC6K1a!1/!112 %%++-K(..r2K 'q!Q'<=qRVX\^_O_@`@c@cdodvdv@wwL'1,L1<Q1l{l=R1S1_1_i2K1a+-. r@   Nry   r   r   r  rv  r   r  labelsr   r   r   return_dictc                 D   US L US L-  (       a  [        S5      eUb  UOU R                  R                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUS L=(       a    U	S LnUbR  U R                  R
                  U R                  :  a.  XR                  R
                  :H  nUR                  5       nSUU'   OUnUc  U R                  5       " U5      nUcE  Ub  UR                  5       OSn[        R                  " UUUR                  S   -   UR                  S9nUGbx  U R                  U5      nUcY  XR                  5       " [        R                  " U R                  R
                  [        R                   UR                  S95      :H  nOQXR                  R
                  :H  R#                  S5      nUR%                  U5      R'                  UR                  5      n[)        5       (       ds  UU   R+                  5       UR+                  5       :w  aN  UR-                  SS9R-                  SS9S   n[        SU S	UR                  S   UR                  S   -   S
35      eUR'                  UR                  UR.                  5      nUR1                  UU5      nU R3                  X6XWX5      nU R4                  " SUUUUU
UUSUS.	UD6n[7        UR8                  U
(       a  UR:                  OS UR<                  UR>                  Ub  WS9$ S S9$ )Nr#  r   r   r   )r~   r   r   rk  zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.T)	r   r   r  r  r   r   r   r  r   )r&  r  r   r'  image_hidden_statesrZ   ) r(  r   r   r   use_return_dictrR   r7   rt  get_input_embeddingsr*  rv   r   r   r   rd  rw   longr+  	expand_asr|   r   numelsumr~   masked_scatterr,  r?  rh   r&  r  r   r'  )r6   ry   r_  r   r   r  rv  r   r  r  r   r   r   r  	lm_kwargsrf  special_image_maskllm_input_idsr5  rc  image_tokens_in_textr6  r   s                          r>   r{   Gemma3Model.forward.  sN   & -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$D0GV45G  T[[%?%?4??%R!*kk.H.H!H%OO-M01M,-%M  557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\BN %26O6O6QLL!;!;5::VcVjVjk7 &" '0;;3M3M&M%X%XY[%\"%7%A%A-%P%S%STaThTh%i"+---@R2S2Y2Y2[_m_s_s_u2u(:'?'?A'?'F'J'Jq'J'QRS'T$ /00N~OcOcdeOfiwi}i}~  jA  PA  OB B44 
 ,..}/C/C]EXEXYN)889K^\M..O]
 %% 
&%+'/!5)
 
 )%777@G33d!//))2>2J
 	

 QU
 	
r@   rZ   )F)NNNNNNNNNNNNN)rB   rC   rD   rE   rv   r   rd  r   r,  r   r   r   r   r	   r   r   r   r
   rh   r{   rH   rZ   r@   r>   r^  r^    s   u||  * "N N`  '+*.1537KO595959-1$(,0/3&*Y
##Y
 ''Y
 !.	Y

 u//0Y
 "%U->->(?(F"GHY
 !!1!12Y
 !!1!12Y
   1 12Y
 ))*Y
 D>Y
 $D>Y
 'tnY
 d^Y
  
u//	0!Y
  Y
r@   r^  c            "         ^  \ rS rSr\              SS\R                  S\R                  S\\R                     S\\R                     S\\
\\R                     \4      S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\
\\R                  4   S\
\\4   4S jj5       r          SU 4S jjrSrU =r$ )Gemma3ForConditionalGenerationi  ry   r_  r   r   r  rv  r   r  r  r   r   r   r  logits_to_keepr   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  " SUUUUUUUU
U	UUUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU	GbQ  UR                  5       nUSSS2SS24   nU	SSS24   nUb  USS2UR                  S   * S24   R                  UR                  5      nUUR                  UR                  5      S:g     R                  5       nUUR                  UR                  5      S:g     R                  5       nO UR                  5       nUR                  5       n[        R                  " 5       nUR!                  SU R                   R"                  R$                  5      nUR!                  S5      R                  UR                  5      nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ ['        UUUR(                  UR*                  UR,                  UR.                  S9$ )	a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenizer=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
N)ry   r_  rv  r   r   r  r  r   r  r   r   r  r   r   .r   r   )losslogitsr  r   r'  r  rZ   )r   r   r   r  rA  r]   re   slicelm_headrf   r   r|   r   r   r  CrossEntropyLossr   rU   r7   rl   r  r   r'  r  )r6   ry   r_  r   r   r  rv  r   r  r  r   r   r   r  r  r  r   r   slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsoutputs                               r>   r{   &Gemma3ForConditionalGeneration.forward  s~   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
%))%+'/!5#)
 
"  
8B>SV8W8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5DY,F'+'7D7V#CVC+#33!//)) ' ; ;
 	
r@   c                   > [         TU ]  " U4UUUUUU	U
US.UD6nUS   S:X  a  XmS'   US L=(       a    US LnUS   S:X  a>  [        U[        5      (       a)  Ub  UOUnU R                  R                  XxX$X5      nUUS'   U$ )N)r  r  r   r   r   r   r  rv  r   r_  r   )r1   prepare_inputs_for_generationr]   r   rA  r,  )r6   ry   r  r  r   r   r_  r   rv  r   r  r  r_   model_inputsrf  rw  r6  r=   s                    r>   r  <Gemma3ForConditionalGeneration.prepare_inputs_for_generation  s      w<
+')%)))
 
 !!+7($D0GV45G!!j+&N&N,9,E=9L**88Q]K .9L)*r@   rZ   )NNNNNNNNNNNNNr   )
NNNNNNNTNN)rB   rC   rD   rE   r   rv   r   r   r	   r   r   r   r   r   re   r
   rl   r{   r  rH   rI   rJ   s   @r>   r  r    s    '+*.1537KO595959-1$(,0/3&*34|
##|
 ''|
 !.	|

 u//0|
 "%U->->(?(F"GH|
 !!1!12|
 !!1!12|
   1 12|
 ))*|
 D>|
 $D>|
 'tn|
 d^|
 c5<</0|
" 
u22	3#|
 |
B ) )r@   r  )rL   r,   r   r  r>  r  r^  )Sr  collections.abcr   dataclassesr   	functoolsr   typingr   r   r   r	   r
   r   rv   torch.nnr  torch.utils.checkpointcache_utilsr   r   r   configuration_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.deprecationr   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r   r   r   r    r!   r"   r#   r$   r%   paligemma.modeling_paligemmar&   r'   r(   r)   siglipr*   
get_loggerrB   r[   r,   rL   rh   rl   r
  rn   r   r   r   r   Moduler   GEMMA3_START_DOCSTRINGr   r  r>  r  r^  r  __all__rZ   r@   r>   <module>r     s     $ !  : :    : : 3 B 7 9 5 & X X 0 6
 
 
  ( 
		H	%N%| N%b[## [#| 	 < 	 	 	#B 	 	
SBLL 
S!	 !
M 
!1 !J)o J)Z[ [|  ;1 ;4F
k F
R-) -!@		 !@Hz
. z
zi%F iXr@   