o
    Zh                     @   sl  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlZddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; e&<e=Z>G dd de*Z?G dd deZ@eG dd de9ZAeG dd de6ZBG dd dejCZDG dd  d e.ZEG d!d" d"e1ZFG d#d$ d$e2ZGG d%d& d&e,ZHG d'd( d(ejIZJdZKG d)d* d*e0ZLG d+d, d,e/ZMG d-d. d.e-ZNG d/d0 d0ejIZOG d1d2 d2e8ZPG d3d4 d4e7ZQg d5ZRdS )6    N)Callable)	dataclass)partial)AnyDictListOptionalTupleUnion   )CacheHybridCacheStaticCache)PretrainedConfig)FlashAttentionKwargs)BaseModelOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging)deprecate_kwarg   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaligemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)SiglipVisionConfigc                       s6   e Zd ZdZdZ								d fd	d
	Z  ZS )Gemma3TextConfiga!  
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096): in Gemma3Text, every other layer uses sliding window attention. This is the
            size of the sliding window.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        sliding_window_pattern (`int`, *optional*, defaults to 6):
            Pattern for the sliding window attention.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        sliding_window_pattern (`int`, *optional*, defaults to 6):
            Pattern for the sliding window attention.
    Zgemma3_text@      .AN     @      c	           
         s2   t  j| fi |	 || _|| _|| _t|  d S N)super__init__rope_local_base_freqsliding_window_patternrope_scalingr   )
self
vocab_size
rope_thetar5   r3   r4   Zmax_position_embeddingsZfinal_logit_softcappingZattn_logit_softcappingZsuper_kwargs	__class__ X/var/www/auris/lib/python3.10/site-packages/transformers/models/gemma3/modular_gemma3.pyr2      s
   zGemma3TextConfig.__init__)r+   r,   Nr-   r.   r/   NN)__name__
__module____qualname____doc__
model_typer2   __classcell__r;   r;   r9   r<   r*   ;   s    yr*   c                       s   e Zd ZdZdZddddZeedZ					
			dde	e
eeeef f  de	e
eeeef f  dededededef fddZ  ZS )Gemma3Configa  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Zgemma3image_token_indexboi_token_indexeoi_token_index)image_token_idZboi_token_idZeoi_token_id)text_configvision_configN         {Gz?rH   rI   mm_tokens_per_imageinitializer_rangec           	         s   |d u rt  }td nt|trt di |}t|tr&tdi |}n|d u r2t }td || _|| _|| _|| _	|| _
|| _|| _t jdi | d S )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.r;   )r*   loggerinfo
isinstancedictr)   rH   rI   rO   rE   rF   rD   rP   r1   r2   )	r6   rH   rI   rO   rE   rF   rD   rP   kwargsr9   r;   r<   r2     s$   


zGemma3Config.__init__)NNrJ   rK   rL   rM   rN   )r=   r>   r?   r@   rA   Zattribute_mapr*   r)   Zsub_configsr   r
   r   strr   intfloatr2   rB   r;   r;   r9   r<   rC      s@    0rC   c                   @      e Zd ZdS )Gemma3ModelOutputWithPastNr=   r>   r?   r;   r;   r;   r<   rZ   *      rZ   c                   @   rY   )Gemma3CausalLMOutputWithPastNr[   r;   r;   r;   r<   r]   /  r\   r]   c                	       sH   e Zd ZdZddedededef fddZd	ejf fd
dZ	  Z
S )Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                    s*   t  ||| | jdt|dd d S )Nrc   F)
persistent)r1   r2   Zregister_buffertorchtensor)r6   r`   ra   rb   rc   r9   r;   r<   r2   9  s   z&Gemma3TextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S r0   )r1   forwardrc   toweightdtype)r6   rg   r9   r;   r<   rh   =  s   z%Gemma3TextScaledWordEmbedding.forward)r_   )r=   r>   r?   r@   rW   rX   r2   re   Tensorrh   rB   r;   r;   r9   r<   r^   4  s     r^   c                       s"   e Zd Zdef fddZ  ZS )	Gemma3MLPconfigc                       t  | d S r0   r1   r2   r6   rn   r9   r;   r<   r2   B     zGemma3MLP.__init__r=   r>   r?   r*   r2   rB   r;   r;   r9   r<   rm   A  s    rm   c                       s(   e Zd Zddedef fddZ  ZS )Gemma3RMSNormư>dimepsc                    s   t    d S r0   rp   )r6   rv   rw   r9   r;   r<   r2   G  s   zGemma3RMSNorm.__init__)ru   )r=   r>   r?   rW   rX   r2   rB   r;   r;   r9   r<   rt   F  s     rt   c                       s$   e Zd Zddef fddZ  ZS )Gemma3RotaryEmbeddingNrn   c                    ro   r0   rp   )r6   rn   devicer9   r;   r<   r2   L  rr   zGemma3RotaryEmbedding.__init__r0   rs   r;   r;   r9   r<   rx   K  s    rx   c                       s   e Zd Zdedef fddZ		ddejdejdeej d	ee	 d
eej
 dee deejeej eeej  f fddZ  ZS )Gemma3Attentionrn   	layer_idxc                    sX   t |d |j | _t   | jr|jnd | _t|j|jd| _	t|j|jd| _
d S )N   )rv   rw   )boolr4   
is_slidingr1   r2   sliding_windowrt   head_dimrms_norm_epsq_normk_normr6   rn   r{   r9   r;   r<   r2   R  s
   
zGemma3Attention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionrU   returnc                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ur|||| j
d}||
|| j|\}
}|d ur| jjdkr|j d }|
d d d d d |d d f |d d d d d |d d f }
}t}| jjdkr| jjdkr|dd	rtd
 nt| jj }|d ur||	}|| |	|
||f| jr| jnd| j| j
d|\}}|jg |dR   }| |}||fS )Nr|   r   )sincosr   r   flash_attention_2eagerZsdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )Zdropoutscalingr   )shaper   Zq_projview	transposeZk_projZv_projr   r   r#   r   updater{   rn   _attn_implementationr$   getrQ   warning_oncer   ri   trainingZattention_dropoutr   reshape
contiguousZo_proj)r6   r   r   r   r   r   rU   Zinput_shapeZhidden_shapeZquery_statesZ
key_statesZvalue_statesr   r   Zcache_kwargsseq_lenZattention_interfaceZattn_outputZattn_weightsr;   r;   r<   rh   [  sX   	


B
	

zGemma3Attention.forward)NN)r=   r>   r?   r*   rW   r2   re   rl   r   r   
LongTensorr   r   tuplerh   rB   r;   r;   r9   r<   rz   Q  s&    rz   c                       s   e Zd Zdedef fddZeddd								dd
ejdejdejde	ej de	ej
 de	e de	e de	e de	ej
 deeje	eejejf  f fddZ  ZS )Gemma3DecoderLayerrn   r{   c                    s   t    || _|j| _|| _t||d| _t|| _t	| j|j
d| _t	| j|j
d| _t	| j|j
d| _t	| j|j
d| _| jj| _|j| _d S )N)rn   r{   rw   )r1   r2   rn   hidden_sizer{   rz   	self_attnrm   mlprt   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr~   r   r   r9   r;   r<   r2     s   


zGemma3DecoderLayer.__init__Zlast_cache_positionz4.53.0)versionNFr   position_embeddings_globalposition_embeddings_localr   position_idsr   r   	use_cacher   r   c
                 K   sv  | j rn|d urnt|	jd | j}| jjdkr"|d d | d f }nLt|jj	}tj
tj|tjd| j d}t|||}|	d | d }tj|dd}tjt	||jd |jd}||7 }|d d d d d d |f }|}| |}| jj r||}n|}| jd
||||||||	d	|
\}}| |}|| }|}| |}| |}| |}|| }|f}|r||f7 }|S )Nr   r   rk   Zdiagonalr   r|   )minry   )r   r   r   r   r   r   r   r   r;   )r~   maxr   r   rn   r   re   finfork   r   ZtrilZ	ones_liker}   whereclamparangery   r   r   r   r   r   r   )r6   r   r   r   r   r   r   r   r   r   rU   Zeffective_seq_len	min_dtypeZsliding_window_maskoffsetZmask_indexesZresidualr   Zself_attn_weightsoutputsr;   r;   r<   rh     sX   
	





zGemma3DecoderLayer.forward)NNNFFN)r=   r>   r?   r*   rW   r2   r   re   rl   r   r   r   r}   r   FloatTensorrh   rB   r;   r;   r9   r<   r     s<    
	
r   c                   @   s    e Zd ZdZg dZdd ZdS )Gemma3PreTrainedModel )r   ZSiglipVisionEmbeddingsZSiglipEncoderLayerZ#SiglipMultiheadAttentionPoolingHeadc                 C   s   | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tjrF|jjjd|d |jd urD|jj|j 
  d S d S t|trT|jjd d S t|tra|jj
  d S d S )Nr   )meanstdr_   )rn   rP   rS   nnZLinearZConv2drj   dataZnormal_ZbiasZzero_	Embeddingrb   rt   Zfill_Gemma3MultiModalProjectormm_input_projection_weight)r6   moduler   r;   r;   r<   _init_weights  s    



z#Gemma3PreTrainedModel._init_weightsN)r=   r>   r?   base_model_prefixZ_no_split_modulesr   r;   r;   r;   r<   r     s    r   c                       s   e Zd ZeZdef fddZ									ddeej deej	 deej dee
 d	eej d
ee dee dee deej dee defddZ  ZS )Gemma3TextModelrn   c                    sX   t  | t|j|j| j| jjd d| _t	|}|j
|_ddi|_t|d| _d S )N      ?)rc   Z	rope_typedefault)rn   )r1   r2   r^   r7   r   rb   rn   embed_tokenscopydeepcopyr3   r8   r5   rx   rotary_emb_localrq   r9   r;   r<   r2     s   

zGemma3TextModel.__init__Nrg   r   r   past_key_valuesinputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsr   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|r[|d u r[| js[|j
\}}}t| j |||jd}|	d u rw|d urg| nd}tj|||j
d  |jd}	|d u r|	d}| |||	||}|}| ||}| ||}|rdnd }|rdnd }| jd | j j D ]C}|r||f7 }| jr| jr| t|jfi |
|||||||||	
}n||f||||||||	d	|
}|d }|r||d f7 }q| |}|r||f7 }t||||d
S )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)Zmax_batch_sizeZmax_cache_lenrk   r   r|   r   r;   )r   r   r   r   r   r   r   r   )last_hidden_stater   r   
attentions)rn   r   r   r   
ValueErrorZgradient_checkpointingr   rQ   r   r   r   r   rk   get_seq_lengthre   r   ry   	unsqueeze_update_causal_maskZ
rotary_embr   ZlayersZnum_hidden_layersZ_gradient_checkpointing_funcr   __call__Znormr   )r6   rg   r   r   r   r   r   r   r   r   r   
batch_sizer   _past_seen_tokenscausal_maskr   r   r   Zall_hidden_statesZall_self_attnsZdecoder_layerZlayer_outputsr;   r;   r<   rh   +  s   

	



zGemma3TextModel.forward)	NNNNNNNNN)r=   r>   r?   r*   config_classr2   r   re   r   rl   r   r   r}   r   r   r   rh   rB   r;   r;   r9   r<   r     sF    	
r   c                       s*   e Zd ZeZdZdef fddZ  ZS )Gemma3ForCausalLMlanguage_modelrn   c                    s   t  | t|| _d S r0   )r1   r2   r   modelrq   r9   r;   r<   r2     s   zGemma3ForCausalLM.__init__)r=   r>   r?   r*   r   r   r2   rB   r;   r;   r9   r<   r     s    r   c                       s2   e Zd Zdef fddZdejfddZ  ZS )r   rn   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr   r   )kernel_sizeZstride)r1   r2   r   	Parameterre   ZzerosrI   r   rH   r   rt   Zlayer_norm_epsmm_soft_emb_normrW   Z
image_sizeZ
patch_sizepatches_per_imagerO   Ztokens_per_sider   Z	AvgPool2davg_poolrq   r9   r;   r<   r2     s   
z"Gemma3MultiModalProjector.__init__vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr|   r   )r   r   r   r   r   r   flattenr   re   matmulr   Ztype_as)	r6   r   r   r   Z
seq_lengthZreshaped_vision_outputsZpooled_vision_outputsZnormed_vision_outputsZprojected_vision_outputsr;   r;   r<   rh     s   



z!Gemma3MultiModalProjector.forward)	r=   r>   r?   rC   r2   re   rl   rh   rB   r;   r;   r9   r<   r     s    r   c                !   @   s   e Zd ZdejdejfddZ	ddefddZee																											dd
ej
dejdeej deej
 deeeej ef  deej
 deej
 deej deej
 dee dee dee dee deeef fddZd	S )Gemma3Modelpixel_valuesr   c                 C   s   | j |dj}| |}|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r   )Zvision_towerr   Zmulti_modal_projector)r6   r   r   image_featuresr;   r;   r<   get_image_features  s   

zGemma3Model.get_image_featuresFis_trainingc                 C   s  | j jjdkr	|S |d ur| dkr|S t|t}t| jj	}|j
d d \}	}
|r1| }nt|tr;| }nt|tjrF|j
d n|d |
 d }|d urZ| dkrZ|S tj|
|f|| j|jd}|
dkrrtj|dd}|tj||jd	|ddk9 }|d d d d d d f |	ddd}|d ur |
dkr |d|dk}d
||dk< |dk}|tjj|dddd d d df  @ }tj| ddd }t||t|d}|d|dk}d
||dk< ||@ dj|jtjd}| }|d d d d d d d |
f |d|d d d d d d d |
f< |d urt| }|j
d }|d d d d d d d |f |d d d d d d f |j }|dk}|d d d d d d d |f |||d d d d d d d |f< |S )Nr      r   r   r   r|   )Z
fill_valuerk   ry   r   r   F)r|   r   )valuerv   r   r   ) rn   rH   r   rv   rS   r   re   r   rk   r   r   Zget_max_cache_shaper   rl   fullry   Ztriur   r   expandr   r   Z
functionalpadZcumsumrW   r   Z	full_likeri   r}   cloneZmasked_fill)r6   r   token_type_idsr   r   input_tensorr   Zusing_static_cacher   Zinputs_lead_dimZsequence_lengthZtarget_lengthr   Ztoken_type_maskZis_imageZnew_image_startZimage_group_idsZsame_image_maskZ
image_maskZmask_lengthZpadding_maskr;   r;   r<   r     s^   	




 $(  

@  zGemma3Model._update_causal_maskNrg   r   r   r   r   r   r   labelsr   r   r   return_dictc                 K   s.  |d u |d uA rt d|d ur|n| jj}|d ur|n| jj}|d ur&|n| jj}|d uo1|	d u}|d urL| jj| jkrL|| jjk}| }d||< n|}|d u rX|  |}|d u rt|d urd|	 nd}t
j|||jd  |jd}|d ur| |}|d u r||  t
j| jjt
j|jdk}n|| jjkd}|||j}t s||  | kr|jddjddd }t d| d	|jd |jd   d
||j|j}|||}| ||||||}| jd|||||
||d|d	|}t|j|
r|jnd |j|j|d ur|dS d dS )Nr   r   r|   r   )rk   ry   r   r   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.T)	r   r   r   r   r   r   r   r   r   )r   r   r   r   image_hidden_statesr;   ) r   rn   r   r   use_return_dictrG   r7   r   Zget_input_embeddingsr   re   r   r   ry   r   rf   longr   Z	expand_asri   r   Znumelsumrk   Zmasked_scatterr   r   rZ   r   r   r   r   )r6   rg   r   r   r   r   r   r   r   r   r   r   r   r   	lm_kwargsr   Zspecial_image_maskZllm_input_idsr   r   Zimage_tokens_in_textr   r   r;   r;   r<   rh   .  s~   


zGemma3Model.forward)F)NNNNNNNNNNNNN)r=   r>   r?   re   rl   r   r}   r   r   r   r   r   r   r
   r   r   r	   rZ   rh   r;   r;   r;   r<   r     sd    
P	

r   c                "       s   e Zd Ze														ddejdejdeej deej dee	e
ej ef  deej d	eej d
eej deej dee dee dee dee de	eejf de	eef fddZ										d fdd	Z  ZS )Gemma3ForConditionalGenerationNr   rg   r   r   r   r   r   r   r   r   r   r   r   r   logits_to_keepr   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd||||||||
|	||||d|}|d }t|trCt| dn|}| |dd|ddf }d}|	dur|	 }|dddddf }|	dddf }|dur|dd|j
d  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)rg   r   r   r   r   r   r   r   r   r   r   r   r   r   .r   r|   )losslogitsr   r   r   r   r;   )rn   r   r   r   r   rS   rW   sliceZlm_headrX   r   ri   ry   r   r   ZCrossEntropyLossr   rH   r7   r]   r   r   r   r   )r6   rg   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Zslice_indicesr  r  Zshift_logitsZshift_labelsZshift_attention_maskZloss_fctZflat_logitsZflat_labelsoutputr;   r;   r<   rh     sd   @$
z&Gemma3ForConditionalGeneration.forwardTc                    s   t  j|f||||||	|
|d|}|d dkr||d< |d uo$|d u}|d dkrGt|trG|d ur6|n|}| j||||||}||d< |S )N)r   r   r   r   r   r   r   r   r   r   r   )r1   prepare_inputs_for_generationrS   r   r   r   )r6   rg   r   r   r   r   r   r   r   r   r   r   rU   Zmodel_inputsr   r   r   r9   r;   r<   r    s0   
z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNNNNr   )
NNNNNNNTNN)r=   r>   r?   r   re   r   r   r   rl   r
   r   r   r}   rW   r	   r]   rh   r  rB   r;   r;   r9   r<   r     sv    	

 r   )rC   r*   r   r   r   r   r   )Sr   collections.abcr   dataclassesr   	functoolsr   typingr   r   r   r   r	   r
   re   Ztorch.nnr   Ztorch.utils.checkpointZcache_utilsr   r   r   Zconfiguration_utilsr   Zmodeling_flash_attention_utilsr   Zmodeling_outputsr   Zmodeling_rope_utilsr   Zmodeling_utilsr   Zprocessing_utilsr   utilsr   r   r   r   Zutils.deprecationr   Zgemma2.configuration_gemma2r   Zgemma2.modeling_gemma2r   r   r   r   r    r!   r"   r#   r$   Zpaligemma.modeling_paligemmar%   r&   r'   r(   Zsiglipr)   Z
get_loggerr=   rQ   r*   rC   rZ   r]   r   r^   rm   rt   rx   rz   Moduler   ZGEMMA3_START_DOCSTRINGr   r   r   r   r   r   __all__r;   r;   r;   r<   <module>   s\    ,
 ^M^ 
	$ > -