# transformers/models/gemma2/modeling_gemma2.py
# PyTorch Gemma-2 model (decoder-only transformer with alternating sliding-window and global attention).

from functools import partial
from typing import Callable, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, HybridCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_gemma2 import Gemma2Config


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)
d Z  Z	S )Gemma2RMSNormư>dimepsc                    s&   t    || _tt|| _d S N)super__init__r#   nn	ParametertorchZzerosweight)selfr"   r#   	__class__ Y/var/www/auris/lib/python3.10/site-packages/transformers/models/gemma2/modeling_gemma2.pyr&   8   s   
zGemma2RMSNorm.__init__c                 C   s$   |t |djddd| j  S )N   T)Zkeepdim)r)   Zrsqrtpowmeanr#   )r+   xr.   r.   r/   _norm=   s   $zGemma2RMSNorm._normc                 C   s*   |  | }|d| j   }||S )N      ?)r5   floatr*   Ztype_as)r+   r4   outputr.   r.   r/   forward@   s   
zGemma2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler*   shaper#   r+   r.   r.   r/   
extra_reprG   s   zGemma2RMSNorm.extra_repr)r!   )
__name__
__module____qualname__intr7   r&   r5   r9   r=   __classcell__r.   r.   r,   r/   r    7   s
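# Illustrative sketch (not part of the upstream file): with the weight still at its zero
# initialization, Gemma2RMSNorm scales the normalized input by (1 + 0) = 1, so the output RMS
# is approximately 1.
#
#     >>> norm = Gemma2RMSNorm(dim=4, eps=1e-6)
#     >>> x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
#     >>> y = norm(x)
#     >>> torch.allclose(torch.sqrt(y.pow(2).mean(-1)), torch.ones(1), atol=1e-3)
#     tensor(True)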
class Gemma2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_activation]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
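# Illustrative shape check (not part of the upstream file): cos/sin produced by the rotary
# embedding have shape [batch, seq_len, head_dim]; unsqueeze_dim=1 lets them broadcast against
# q/k of shape [batch, num_heads, seq_len, head_dim].
#
#     >>> b, h, s, d = 1, 8, 16, 256
#     >>> q, k = torch.randn(b, h, s, d), torch.randn(b, h, s, d)
#     >>> cos, sin = torch.randn(b, s, d), torch.randn(b, s, d)
#     >>> q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
#     >>> q_embed.shape
#     torch.Size([1, 8, 16, 256])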
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = module.head_dim**-0.5

    # Grouped-query attention: expand the key/value heads to match the number of query heads.
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        # Attention logit soft-capping: bound the logits to (-softcap, softcap) via tanh.
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights
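# Illustrative sketch (not part of the upstream file): the tanh soft-cap keeps the attention
# logits inside (-softcap, softcap) while staying monotonic, e.g. with a cap of 50.0:
#
#     >>> logits = torch.tensor([-1000.0, -10.0, 0.0, 10.0, 1000.0])
#     >>> capped = 50.0 * torch.tanh(logits / 50.0)
#     >>> capped.abs().max() < 50.0
#     tensor(True)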
class Gemma2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = config.query_pre_attn_scalar**-0.5
        self.attention_dropout = self.config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.attn_logit_softcapping = self.config.attn_logit_softcapping
        self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
                "sliding_window": self.sliding_window,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

            # Here we need to slice as we use a static cache by default, but FA2 does not support it
            if attention_mask is not None and self.config._attn_implementation == "flash_attention_2":
                seq_len = attention_mask.shape[-1]
                key_states, value_states = key_states[:, :, :seq_len, :], value_states[:, :, :seq_len, :]

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            softcap=self.attn_logit_softcapping,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Gemma2DecoderLayer(nn.Module):
    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.config = config
        self.is_sliding = not bool(layer_idx % 2)
        self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
        self.mlp = Gemma2MLP(config)
        self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window

    @deprecate_kwarg("last_cache_position", version="4.53.0")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
            # In prefill, we may be larger than sliding window
            effective_seq_len = max(cache_position.shape[0], self.sliding_window)
            # For FA2, the mask is 2D and of shape [bs, processed_tokens] (not [bs, max_cache_len]),
            # thus we must slice from the right (at most `effective_seq_len` elements)
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask[:, -effective_seq_len:]
            # Otherwise, the mask is 4D of shape [bs, 1, query_len, max_cache_len], so we slice
            # from the left, with an offset if we are beyond the sliding window
            else:
                min_dtype = torch.finfo(attention_mask.dtype).min
                sliding_window_mask = torch.tril(
                    torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
                )
                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                # In case we are beyond the sliding window, we need to correctly offset the mask slicing
                offset = cache_position[-1] - effective_seq_len + 1
                # Should only be used when beyond the sliding window (i.e. offset > 0)
                offset = torch.clamp(offset, min=0)
                # Equivalent to `attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]`,
                # but without data-dependent slicing (i.e. torch.compile friendly)
                mask_indexes = torch.arange(
                    min(effective_seq_len, attention_mask.shape[-1]), device=attention_mask.device
                )
                mask_indexes += offset
                attention_mask = attention_mask[:, :, :, mask_indexes]

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs
class Gemma2RotaryEmbedding(nn.Module):
    def __init__(self, config: Gemma2Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
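# Illustrative sketch (not part of the upstream file): for the "default" rope type the inverse
# frequencies follow 1 / base**(2i/dim), which is what the registered ROPE_INIT_FUNCTIONS entry
# computes for head_dim-sized rotations.
#
#     >>> dim, base = 8, 10000.0
#     >>> inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
#     >>> inv_freq.shape
#     torch.Size([4])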
@auto_docstring
class Gemma2PreTrainedModel(PreTrainedModel):
    config_class = Gemma2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Gemma2DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, Gemma2RMSNorm):
            module.weight.data.fill_(1.0)
@auto_docstring
class Gemma2Model(Gemma2PreTrainedModel):
    def __init__(self, config: Gemma2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Gemma2RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[HybridCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None and not self.training:
            batch_size, seq_len, _ = inputs_embeds.shape
            past_key_values = HybridCache(
                self.config,
                max_batch_size=batch_size,
                max_cache_len=seq_len,
                dtype=inputs_embeds.dtype,
                device=self.device,
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # normalize the embeddings by sqrt(hidden_size), computed in the hidden-state dtype
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    partial(decoder_layer.__call__, **flash_attn_kwargs),
                    hidden_states,
                    position_embeddings,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    position_embeddings=position_embeddings,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: HybridCache,
        output_attentions: bool = False,
    ):
        # Flash Attention does not use the 4D mask: the 2D mask is passed through as-is and its
        # shape is later used to cut out key/value duplicates when using the static cache.
        if self.config._attn_implementation == "flash_attention_2":
            return attention_mask

        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if isinstance(past_key_values, (HybridCache, StaticCache)):
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )
        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
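# Illustrative sketch (not part of the upstream file): for a prompt of 3 tokens scored against
# a static cache of length 5, only the causal, already-filled positions stay at 0; everything
# else is set to the dtype minimum.
#
#     >>> mask = Gemma2Model._prepare_4d_causal_attention_mask_with_cache_position(
#     ...     attention_mask=None,
#     ...     sequence_length=3,
#     ...     target_length=5,
#     ...     dtype=torch.float32,
#     ...     cache_position=torch.arange(3),
#     ...     batch_size=1,
#     ... )
#     >>> mask.shape
#     torch.Size([1, 1, 3, 5])
#     >>> (mask == 0).long()[0, 0]
#     tensor([[1, 0, 0, 0, 0],
#             [1, 1, 0, 0, 0],
#             [1, 1, 1, 0, 0]])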
u&r   c                       s  e Zd ZdgZddiZddgdgfiZ fddZdd	 Zd
d Zdd Z	dd Z
dd Zdd Zee											d'deej deej deej dee deej deej dee dee dee deej d eeejf d!efd"d#Z						$	d( fd%d&	Z  ZS ))Gemma2ForCausalLMzlm_head.weightlm_headZcolwise_reprZ   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rD   )
r%   r&   r   r   r   r'   rI   rH   r   r   rN   r,   r.   r/   r&     s
   
zGemma2ForCausalLM.__init__c                 C      | j jS r$   r   r   r<   r.   r.   r/   r        z&Gemma2ForCausalLM.get_input_embeddingsc                 C      || j _d S r$   r   r   r.   r.   r/   r        z&Gemma2ForCausalLM.set_input_embeddingsc                 C   r   r$   r   r<   r.   r.   r/   get_output_embeddings  r   z'Gemma2ForCausalLM.get_output_embeddingsc                 C   r   r$   r   )r+   Znew_embeddingsr.   r.   r/   set_output_embeddings  r   z'Gemma2ForCausalLM.set_output_embeddingsc                 C   r   r$   r   )r+   decoderr.   r.   r/   set_decoder  r   zGemma2ForCausalLM.set_decoderc                 C   r   r$   r  r<   r.   r.   r/   get_decoder  r   zGemma2ForCausalLM.get_decoderNr   r   ri   rX   r   r   labelsr   r   r   r   logits_to_keepr\   c                 K   s  | j r| jjdkrtd| jj d |dur|n| jj}|	dur$|	n| jj}	| jd||||||||	|
d	|}|j}t	|t
rHt| dn|}| |dd|ddf }| jjduro|| jj }t|}|| jj }d}|dur| j||| jfi |}t|||j|j|jdS )a'  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        if self.training and self.config._attn_implementation != "eager":
            logger.warning_once(
                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
                f"instead of `{self.config._attn_implementation}`. Use `eager` with "
                "`AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
            )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if self.config.final_logit_softcapping is not None:
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten: has a special cache type, `HybridCache`

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if logits_to_keep is None:
            _ = model_inputs.pop("logits_to_keep", None)

        # With a HybridCache and a 2D mask, build the full 4D static-cache mask here (not needed for FA2).
        if (
            isinstance(past_key_values, HybridCache)
            and attention_mask.ndim == 2
            and not self.config._attn_implementation == "flash_attention_2"
        ):
            if model_inputs["inputs_embeds"] is not None:
                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
                device = model_inputs["inputs_embeds"].device
            else:
                batch_size, sequence_length = model_inputs["input_ids"].shape
                device = model_inputs["input_ids"].device

            attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
                attention_mask,
                sequence_length=sequence_length,
                target_length=past_key_values.get_max_cache_shape(),
                dtype=self.lm_head.weight.dtype,
                device=device,
                cache_position=cache_position,
                batch_size=batch_size,
            )
            model_inputs["attention_mask"] = attention_mask

        return model_inputs


@auto_docstring(
    custom_intro="""
    The Gemma2 Model transformer with a sequence classification head on top (linear layer).

    [`Gemma2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class Gemma2ForSequenceClassification(Gemma2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Gemma2Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        transformer_outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
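# Illustrative sketch (not part of the upstream file): with right padding and pad_token_id=0,
# the rightmost non-pad position is recovered per row as follows.
#
#     >>> input_ids = torch.tensor([[5, 6, 7, 0, 0], [8, 9, 0, 0, 0]])
#     >>> non_pad_mask = (input_ids != 0).to(torch.int32)
#     >>> token_indices = torch.arange(input_ids.shape[-1], dtype=torch.int32)
#     >>> (token_indices * non_pad_mask).argmax(-1)
#     tensor([2, 1])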
@auto_docstring
class Gemma2ForTokenClassification(Gemma2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Gemma2Model(config)
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> TokenClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
__all__ = [
    "Gemma2PreTrainedModel",
    "Gemma2Model",
    "Gemma2ForCausalLM",
    "Gemma2ForSequenceClassification",
    "Gemma2ForTokenClassification",
]