o
    Zh/}                     @   s  d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 d dl
ZddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* e rd dl+m,Z, ddl-m.Z. e/e0Z1G dd deZ2G dd de(Z3G dd de&Z4			d1de	j5dej6dej6dej6deej6 de7dee7 d ee7 d!eej6ej6f fd"d#Z8G d$d% d%e"Z9G d&d' d'e	j5Z:G d(d) d)e'Z;G d*d+ d+e#Z<G d,d- d-e$Z=G d.d/ d/e%Z>g d0Z?dS )2    )partial)CallableOptionalTupleUnionN   )ACT2FN)CacheHybridCacheStaticCache)PretrainedConfig)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)is_torch_flex_attn_availablelogging)deprecate_kwarg   )	GemmaAttentionGemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassificationGemmaMLP
GemmaModelGemmaRMSNormapply_rotary_pos_emb	repeat_kv)	BlockMask)make_flex_block_causal_maskc                       s   e Zd ZdZdZdgZddddddddZdgdgfd	d
gd	gfd	gd	gfdZ																							 	!d$ fd"d#	Z  Z	S )%Gemma2Configa  
    This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma2-7B.
    e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma2Model`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
            size of the sliding window.
        final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*, defaults to 50.0): scaling factor when applying tanh softcapping on the attention scores.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

    ```python
    >>> from transformers import Gemma2Model, Gemma2Config
    >>> # Initializing a Gemma2 gemma2-7b style configuration
    >>> configuration = Gemma2Config()
    >>> # Initializing a model from the gemma2-7b style configuration
    >>> model = Gemma2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Zgemma2past_key_valuesZcolwiseZrowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnorm   	   $              gelu_pytorch_tanh    {Gz?ư>Tr      r        @F                 >@      I@hybridc                    s   t  jd||||d| || _|	| _|| _|| _|| _|| _|| _|| _	|
| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddings )super__init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_headsinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropouthidden_activationquery_pre_attn_scalarsliding_windowfinal_logit_softcappingattn_logit_softcappingcache_implementation)selfrC   rE   rF   rG   rH   rJ   rI   rQ   rD   rK   rL   rM   r<   r>   r=   r?   rN   rO   rP   rR   rS   rT   rU   rV   kwargs	__class__r@   X/var/www/auris/lib/python3.10/site-packages/transformers/models/gemma2/modular_gemma2.pyrB      s8   
zGemma2Config.__init__)r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   Tr   r5   r   Tr6   Fr7   r0   r8   r9   r:   r;   )
__name__
__module____qualname____doc__Z
model_typeZkeys_to_ignore_at_inferenceZbase_model_tp_planZbase_model_pp_planrB   __classcell__r@   r@   rY   r[   r!   6   sR    H


r!   c                   @   s   e Zd ZdS )Gemma2RMSNormN)r\   r]   r^   r@   r@   r@   r[   ra      s    ra   c                          e Zd Z fddZ  ZS )	Gemma2MLPc                    s   t    t|j | _d S N)rA   rB   r   rQ   Zact_fnrW   configrY   r@   r[   rB      s   
zGemma2MLP.__init__r\   r]   r^   rB   r`   r@   r@   rY   r[   rc          rc   r7   modulequerykeyvaluer&   dropoutscalingsoftcapreturnc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d urM|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	N      r   r   )dimdtype)ptrainingr5   )rI   r   Znum_key_value_groupstorchmatmul	transposetanhshapennZ
functionalZsoftmaxZfloat32toru   rm   rw   
contiguous)ri   rj   rk   rl   r&   rm   rn   ro   rX   
key_statesvalue_statesattn_weightscausal_maskattn_outputr@   r@   r[   eager_attention_forward   s"   

&r   c                       s   e Zd Zdedef fddZ		ddejdeejejf de	ej d	e	e
 d
e	ej dee deeje	ej e	eej  f fddZ  ZS )Gemma2Attentionrf   	layer_idxc                    sV   t  || | jj| _| jj| _d| _|jd | _t|d s&|j	| _	d S d | _	d S )NTrq   r   )
rA   rB   rf   rU   rP   Z	is_causalrR   rn   boolrS   rW   rf   r   rY   r@   r[   rB      s   

"zGemma2Attention.__init__Nr%   position_embeddingsr&   past_key_valuecache_positionrX   rp   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d ur|||| jd}|	|
|| j
|\}
}|d ur| jjdkr|j d }|
d d d d d |d d f |d d d d d |d d f }
}t}| jjdkr| jjdkr|dd	rtd
 nt| jj }|| |	|
||f| jr| jnd| j| j| jd|\}}|jg |dR   }| |}||fS )Nrs   r5   r   )sincosr   rS   flash_attention_2eagerZsdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r7   )rm   rn   rS   ro   )r|   rI   Zq_projviewrz   Zk_projZv_projr   rS   updater   rf   _attn_implementationr   getloggerwarning_oncer   rw   rP   rn   rU   Zreshaper   Zo_proj)rW   r%   r   r&   r   r   rX   Zinput_shapeZhidden_shapeZquery_statesr   r   r   r   Zcache_kwargsseq_lenZattention_interfacer   r   r@   r@   r[   forward   sR   	
B


zGemma2Attention.forward)NN)r\   r]   r^   r!   intrB   rx   Tensorr   r   r	   
LongTensorr   r   r   r`   r@   r@   rY   r[   r      s&    r   c                       s   e Zd Zdedef fddZeddd								dd
ejde	ejejf de
ej de
ej de
e de
e de
e de
ej de	eje
e	ejejf  f fddZ  ZS )Gemma2DecoderLayerrf   r   c                    s   t    |j| _|| _t|d  | _t||d| _t|| _	t
|j|jd| _t
|j|jd| _t
|j|jd| _t
|j|jd| _|j| _d S )Nr   )rf   r   )Zeps)rA   rB   rE   rf   r   
is_slidingr   	self_attnrc   mlpra   rL   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormrS   r   rY   r@   r[   rB   >  s   

zGemma2DecoderLayer.__init__Zlast_cache_positionz4.53.0)versionNFr%   r   r&   position_idsr   r   rM   r   rp   c	                 K   sd  | j rn|d urnt|jd | j}
| jjdkr"|d d |
 d f }nLt|jj	}tj
tj|tjd| j d}t|||}|d |
 d }tj|dd}tjt	|
|jd |jd}||7 }|d d d d d d |f }|}| |}| jd
||||||||d	|	\}}| |}|| }|}| |}| |}| |}|| }|f}|r||f7 }|S )Nr   r   ru   )Zdiagonalrs   r5   )mindevice)r%   r   r&   r   r   r   rM   r   r@   )r   maxr|   rS   rf   r   rx   Zfinforu   r   ZtrilZ	ones_liker   whereclamparanger   r   r   r   r   r   r   )rW   r%   r   r&   r   r   r   rM   r   rX   Zeffective_seq_lenZ	min_dtypeZsliding_window_maskoffsetZmask_indexesZresidualZself_attn_weightsoutputsr@   r@   r[   r   L  sR   
	





zGemma2DecoderLayer.forward)NNNFFN)r\   r]   r^   r!   r   rB   r   rx   r   r   r   r   r	   r   FloatTensorr   r`   r@   r@   rY   r[   r   =  s8    
	r   c                       s   e Zd Zdef fddZ									ddeej deej deej dee	 d	eej
 d
ee dee dee deej dee defddZe 	ddeejdf dejdejde	def
ddZ  ZS )Gemma2Modelrf   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |qS r@   )r   ).0r   rf   r@   r[   
<listcomp>  s    z(Gemma2Model.__init__.<locals>.<listcomp>)rA   rB   r}   Z
ModuleListrangerG   r(   re   rY   r   r[   rB     s   
zGemma2Model.__init__Nr#   r&   r   r"   r$   rM   r   output_hidden_statesr   flash_attn_kwargsrp   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|r]|d u r]| js]|j
\}}}t| j |||j| jd}|	d u ry|d uri| nd}tj|||j
d  |jd}	|d u r|	d}| |||	||}|}| ||}tj| j jd |jd	}|| }|rd
nd }|rd
nd }| jd | j j D ]A}|r||f7 }| jr| jr| t|jfi |
||||||||		}n||f|||||||	d|
}|d }|r||d f7 }q| |}|r||f7 }t||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)Zmax_batch_sizeZmax_cache_lenru   r   r   r5   r   g      ?r   r@   )r   r&   r   r   r   rM   r   )last_hidden_stater"   r%   
attentions)rf   r   r   rM   
ValueErrorZgradient_checkpointingrw   r   r   r'   r|   r
   ru   r   Zget_seq_lengthrx   r   Z	unsqueeze_update_causal_maskZ
rotary_embZtensorrE   r(   rG   Z_gradient_checkpointing_funcr   __call__r)   r   )rW   r#   r&   r   r"   r$   rM   r   r   r   r   
batch_sizer   _Zpast_seen_tokensr   r%   r   Z
normalizerZall_hidden_statesZall_self_attnsZdecoder_layerZlayer_outputsr@   r@   r[   r     s   



	

zGemma2Model.forwardFr   input_tensorc              	   C   s   | j jdkr|S | j jdkrt|tjrt|}|S |j|j}}|jd }t|t	t
fr2| }	n|d ur;|jd n|jd }	| j|||	||||jd d}
|
S )Nr   Zflex_attentionr5   rs   r   sequence_lengthtarget_lengthru   r   r   r   )rf   r   
isinstancerx   r   r    ru   r   r|   r
   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_position)rW   r&   r   r   r"   r   ru   r   r   r   r   r@   r@   r[   r     s*   

	zGemma2Model._update_causal_mask)	NNNNNNNNN)F)r\   r]   r^   r!   rB   r   rx   r   r   r
   r   r   r   r   r   r   Zno_gradr   r   r`   r@   r@   rY   r[   r     s^    	

ur   c                       s   e Zd Z fddZ											ddeej deej deej dee d	eej	 d
eej dee
 dee
 dee
 deej deeejf defddZ							d fdd	Z  ZS )Gemma2ForCausalLMc                    "   t  | t|| _|   d S rd   rA   rB   r   modelZ	post_initre   rY   r@   r[   rB   :     
zGemma2ForCausalLM.__init__Nr   r#   r&   r   r"   r$   labelsrM   r   r   r   logits_to_keeprp   c                 K   s  | j r| jjdkrtd| jj d |dur|n| jj}|	dur$|	n| jj}	| jd||||||||	|
d	|}|j}t	|t
rHt| dn|}| |dd|ddf }| jjduro|| jj }t|}|| jj }d}|dur| j||| jfi |}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```r   zhIt is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	r#   r&   r   r"   r$   rM   r   r   r   )losslogitsr"   r%   r   r@   )rw   rf   r   r   r   r   r   r   r   r   r   slicelm_headrT   rx   r{   Zloss_functionrC   r   r"   r%   r   )rW   r#   r&   r   r"   r$   r   rM   r   r   r   r   Zloss_kwargsr   r%   Zslice_indicesr   r   r@   r@   r[   r   ?  sN   !

zGemma2ForCausalLM.forwardTc	              
      s   t  j|f|||||||d|	}
|d u r|
dd }t|trb|jdkrb| jjdksb|
d d ur@|
d j\}}}|
d j	}n|
d j\}}|
d j	}| j
j||| | jjj|||d}||
d< |
S )	N)r"   r&   r$   r   r   rM   r   r   r   r   r$   r#   r   r&   )rA   prepare_inputs_for_generationpopr   r
   ndimrf   r   r|   r   r   r   r   r   weightru   )rW   r#   r"   r&   r$   r   r   rM   r   rX   Zmodel_inputsr   r   r   r   rY   r@   r[   r     sF   	

	z/Gemma2ForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNTN)r\   r]   r^   rB   r   rx   r   r   r
   r   r   r   r   r   r   r   r`   r@   r@   rY   r[   r   9  s\    	

Pr   c                       rb   )Gemma2ForSequenceClassificationc                    r   rd   r   re   rY   r@   r[   rB     r   z(Gemma2ForSequenceClassification.__init__rg   r@   r@   rY   r[   r     rh   r   c                       rb   )Gemma2ForTokenClassificationc                    r   rd   r   re   rY   r@   r[   rB     r   z%Gemma2ForTokenClassification.__init__rg   r@   r@   rY   r[   r     rh   r   )r!   r   r   ZGemma2PreTrainedModelr   r   )r7   NN)@	functoolsr   typingr   r   r   r   rx   Ztorch.nnr}   Ztorch.utils.checkpointZactivationsr   Zcache_utilsr	   r
   r   Zconfiguration_utilsr   Zmodeling_flash_attention_utilsr   Zmodeling_outputsr   r   Zmodeling_utilsr   Zprocessing_utilsr   utilsr   r   Zutils.deprecationr   Zgemma.modeling_gemmar   r   r   r   r   r   r   r   r   Z!torch.nn.attention.flex_attentionr   Zintegrations.flex_attentionr    Z
get_loggerr\   r   r!   ra   rc   Moduler   floatr   r   r   r   r   r   r   __all__r@   r@   r@   r[   <module>   sl   ,
 

#GX % 