from typing import Callable, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_phi3 import Phi3Config


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class Phi3MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # gate and up projections are fused into a single linear layer and split in forward
        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        up_states = self.gate_up_proj(hidden_states)

        gate, up_states = up_states.chunk(2, dim=-1)
        up_states = up_states * self.activation_fn(gate)

        return self.down_proj(up_states)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
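
# Illustrative sketch (the helper name and sizes below are hypothetical, not part of the public API):
# `repeat_kv` expands grouped key/value heads so they line up with the query heads, e.g. 4 KV heads
# shared by 8 attention heads means n_rep = 2.
def _sketch_repeat_kv_shapes():
    kv = torch.zeros(1, 4, 3, 8)  # (batch, num_key_value_heads, seq_len, head_dim)
    expanded = repeat_kv(kv, n_rep=2)
    assert expanded.shape == (1, 8, 3, 8)  # (batch, num_attention_heads, seq_len, head_dim)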
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Phi3 supports partial rotary embeddings: only the first `rotary_dim` channels are rotated
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
    k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1)
    return q_embed, k_embed
ej	ej	f d	eej	 d
ee deej dee de
ej	eej	 ee
ej	  f fddZ  ZS )Phi3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr*   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j| _| jd | _
|j| _d| _|j| j d|j| j   }tj|j| j |jdd| _tj|j|dd| _d S )NrN   g      Tr%   Fr&   )r(   r)   r*   rq   getattrr,   num_attention_headsrN   rL   r\   rV   attention_dropoutZ	is_causalr   r+   o_projqkv_proj)r1   r*   rq   Zop_sizer2   r4   r5   r)      s   
zPhi3Attention.__init__r6   position_embeddingsrU   past_key_valuecache_positionrb   r7   c                 K   s  |j d d }g |d| jR }| |}	| jj| j }
|	dd |
f }|	d|
|
| j| j  f }|	d|
| j| j  d f }||dd}||dd}||dd}|\}}t||||\}}|d ur~|||d}|	||| j
|\}}t}| jjdkr| jjdkr|dd	rtd
 nt| jj }|| ||||f| jsdn| j| jt| jdd d|\}}|jg |dR   }| |}||fS )Nr8   .r    r%   )rm   rl   ry   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rP   sliding_window)rW   rV   r}   )rC   rN   rv   r*   rs   rL   viewr^   ro   updaterq   rh   _attn_implementationgetloggerwarning_oncer   r[   rt   rV   rr   rJ   ra   ru   )r1   r6   rw   rU   rx   ry   rb   Zinput_shapeZhidden_shapeZqkvZ	query_posZquery_statesrc   rd   rl   rm   Zcache_kwargsZattention_interfacerg   re   r4   r4   r5   r<      sL   	
	

zPhi3Attention.forwardN)NN)r=   r>   r?   __doc__r!   r   intr)   r@   Tensorr   r	   
LongTensorr   r   r<   rB   r4   r4   r2   r5   rp      s(    rp   ZRMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	Phi3RMSNormư>c                    s&   t    tt|| _|| _dS )z:
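
# Illustrative sketch (hypothetical helper and sizes, not part of the public API): with cos = 1 and
# sin = 0 over the first `rotary_dim` channels, `apply_rotary_pos_emb` is the identity, and channels
# past `rotary_dim` always pass through unrotated (partial rotary embeddings).
def _sketch_partial_rotary_identity():
    q = torch.randn(1, 2, 5, 8)  # (batch, heads, seq_len, head_dim)
    k = torch.randn(1, 2, 5, 8)
    cos = torch.ones(1, 5, 6)  # rotary_dim = 6 < head_dim = 8
    sin = torch.zeros(1, 5, 6)
    q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
    assert q_embed.shape == q.shape and torch.allclose(q_embed, q)
    assert k_embed.shape == k.shape and torch.allclose(k_embed, k)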
@use_kernel_forward_from_hub("RMSNorm")
class Phi3RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Phi3RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
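
# Illustrative sketch (hypothetical helper, not part of the public API): with its default unit weight,
# Phi3RMSNorm divides each hidden vector by its root-mean-square, so the output has RMS close to 1.
def _sketch_rmsnorm_unit_rms():
    norm = Phi3RMSNorm(hidden_size=4, eps=1e-6)
    x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
    y = norm(x)
    rms = y.pow(2).mean(-1).sqrt()
    assert torch.allclose(rms, torch.ones_like(rms), atol=1e-3)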
ee
 dee dee deej	 deeejejf  dee deejeeejejf  f fddZ  ZS )Phi3DecoderLayerr*   rq   c                    st   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
|| _t|j| _t|j| _d S )N)r*   rq   r   )r(   r)   r,   rp   	self_attnr$   mlpr   rms_norm_epsinput_layernormpost_attention_layernormr*   r   DropoutZresid_pdropresid_attn_dropoutresid_mlp_dropout)r1   r*   rq   r2   r4   r5   r)      s   

zPhi3DecoderLayer.__init__NFr6   rU   rn   rx   r|   	use_cachery   rw   rb   r7   c	                 K   s   |}
|  |}| jd||||||||d|	\}}|
| | }|}
| |}| |}|
| | }|f}|r>||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
            past_key_value (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r6   rU   rn   rx   r|   r   ry   rw   Nr4   )r   r   r   r   r   r   )r1   r6   rU   rn   rx   r|   r   ry   rw   rb   ZresidualZself_attn_weightsoutputsr4   r4   r5   r<     s.   "
	



zPhi3DecoderLayer.forward)NNNFFNN)r=   r>   r?   r!   r   r)   r@   r   r   r   r	   boolr   r   r   rA   r<   rB   r4   r4   r2   r5   r      s<    	
r   c                   @   sL   e Zd ZeZdZdZdgZdgZdZ	dZ
dZdZdZdZdZdZdd ZdS )	Phi3PreTrainedModelmodelTr   past_key_valuesz0.0.5c                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|trQ|jjd d S d S )NrP   )r   stdg      ?)r*   Zinitializer_range
isinstancer   r+   r   dataZnormal_r'   Zzero_	Embeddingpadding_idxr   Zfill_)r1   rQ   r   r4   r4   r5   _init_weightsT  s   


z!Phi3PreTrainedModel._init_weightsN)r=   r>   r?   r!   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attn_2Z_supports_sdpaZ_supports_flex_attnZ_supports_cache_classZ_supports_quantized_cacheZ_supports_static_cacheZ_supports_attention_backend_versionr   r4   r4   r4   r5   r   D  s    r   c                       s8   e Zd Zddef fddZe edd Z  Z	S )Phi3RotaryEmbeddingNr*   c                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
class Phi3RotaryEmbedding(nn.Module):
    def __init__(self, config: Phi3Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
@auto_docstring
class Phi3Model(Phi3PreTrainedModel):
    def __init__(self, config: Phi3Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Phi3RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to "
                        " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        config: Phi3Config,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`Phi3Config`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
                -1, 1
            )
            text_config = config.get_text_config()
            if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
                # the check is needed to verify is current checkpoint was trained with sliding window or not
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
                        cache_position.reshape(-1, 1) - text_config.sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
Vr   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r=   r>   r?   r4   r4   r4   r5   r     s    r   c                       s&  e Zd ZdgZddiZddgdgfiZ fddZdd	 Zd
d Zdd Z	dd Z
dd Zdd Zee											d(deej deej deej dee deej deej dee dee dee deej d eeejf d!ee d"efd#d$Z						%	d) fd&d'	Z  ZS )*Phi3ForCausalLMzlm_head.weightlm_headZcolwise_repr6   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S NFr&   )
r(   r)   r   r   r   r   r+   r,   r   r   r0   r2   r4   r5   r)     s
   
zPhi3ForCausalLM.__init__c                 C      | j jS r   r   r   r   r4   r4   r5   r        z$Phi3ForCausalLM.get_input_embeddingsc                 C      || j _d S r   r   r   r4   r4   r5   r        z$Phi3ForCausalLM.set_input_embeddingsc                 C   r   r   r   r   r4   r4   r5   get_output_embeddings  r   z%Phi3ForCausalLM.get_output_embeddingsc                 C   r   r   r   )r1   Znew_embeddingsr4   r4   r5   set_output_embeddings  r   z%Phi3ForCausalLM.set_output_embeddingsc                 C   r   r   r   )r1   decoderr4   r4   r5   set_decoder  r   zPhi3ForCausalLM.set_decoderc                 C   r   r   r   r   r4   r4   r5   get_decoder  r   zPhi3ForCausalLM.get_decoderNr   r   rU   rn   r   r   labelsr   r|   r   ry   logits_to_keeprb   r7   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }d}|durX| j	d||| j j
d|}t|||j|j|jdS )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Phi3ForCausalLM

        >>> model = Phi3ForCausalLM.from_pretrained("meta-phi3/Phi3-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-phi3/Phi3-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r   rU   rn   r   r   r   r|   r   ry   )r   r   r   lossr   r   r6   r   r4   )r*   r|   r   r   r   r   r   slicer   loss_functionr   r   r   r6   r   )r1   r   rU   rn   r   r   r   r   r|   r   ry   r   rb   r   r6   Zslice_indicesr   r  r4   r4   r5   r<     s:   '
zPhi3ForCausalLM.forwardTc	                    sb   |r| j jr|jd | j jd kr|d }
|
| j jkrd }t jd||||||||d|	}|S )Nr    r   )r   r   rU   r   ry   rn   r   r   r4   )r*   r   rC   Z original_max_position_embeddingsr(   prepare_inputs_for_generation)r1   r   r   rU   r   ry   rn   r   r   rb   Zpast_lengthZmodel_inputsr2   r4   r5   r    s*   	z-Phi3ForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNTN)r=   r>   r?   Z_tied_weights_keysZ_tp_planZ_pp_planr)   r   r   r   r   r   r   r   r   r   r@   r   r   r	   rA   r   r   r   r   r   r   r<   r  rB   r4   r4   r2   r5   r     sv    		
Lr   a  
    The Phi3 Model transformer with a sequence classification head on top (linear layer).

    [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )Zcustom_introc                          e Zd Z fddZdd Zdd Zee									ddee	j
 d	ee	j d
ee	j
 dee dee	j dee	j
 dee dee dee defddZ  ZS )Phi3ForSequenceClassificationc                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r   )
r(   r)   
num_labelsr   r   r   r+   r,   scorer   r0   r2   r4   r5   r)   >  s
   
z&Phi3ForSequenceClassification.__init__c                 C   r   r   r   r   r4   r4   r5   r   G  r   z2Phi3ForSequenceClassification.get_input_embeddingsc                 C   r   r   r   r   r4   r4   r5   r   J  r   z2Phi3ForSequenceClassification.set_input_embeddingsNr   rU   rn   r   r   r   r   r|   r   r7   c
              
   C   s(  | j ||||||||	d}
|
j}| |}|dur|jd }n|jd }| jjdu r2|dkr2td| jjdu r;d}n1|dur`|| jjk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d |t	j||jd	|f }d}|dur| j|||| jd
}t|||
j|
j|
jdS )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        rU   rn   r   r   r   r|   r   Nr   r    z=Cannot handle batch sizes > 1 if no padding token is defined.r8   )r   rY   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   )r   r   pooled_logitsr*   r   )r   r   r  rC   r*   r   r   r`   r   r@   Zint32r   Zargmaxr   r   r3   r=   r  r   r   r6   r   )r1   r   rU   rn   r   r   r   r   r|   r   Ztransformer_outputsr6   r   r   Zlast_non_pad_tokenZnon_pad_maskZtoken_indicesr  r  r4   r4   r5   r<   M  sL   


z%Phi3ForSequenceClassification.forwardr   )r=   r>   r?   r)   r   r   r   r   r   r@   r   r   r	   rA   r   r   r<   rB   r4   r4   r2   r5   r  /  sH    		
r  c                       r  )Phi3ForTokenClassificationc                    s|   t  | |j| _t|| _t|dd d ur|j}nt|dd d ur'|j}nd}t	|| _
t|j|j| _|   d S )Nclassifier_dropouthidden_dropoutg?)r(   r)   r  r   r   rr   r  r  r   r   rW   r+   r,   r  r   )r1   r*   r  r2   r4   r5   r)     s   
z#Phi3ForTokenClassification.__init__c                 C   r   r   r   r   r4   r4   r5   r     r   z/Phi3ForTokenClassification.get_input_embeddingsc                 C   r   r   r   r   r4   r4   r5   r     r   z/Phi3ForTokenClassification.set_input_embeddingsNr   rU   rn   r   r   r   r   r|   r   r7   c
              
   C   sd   | j ||||||||	d}
|
j}| |}| |}d}|dur(| ||| j}t|||
j|
jdS )r	  r
  N)r  r   r6   r   )	r   r   rW   r  r  r*   r   r6   r   )r1   r   rU   rn   r   r   r   r   r|   r   r   Zsequence_outputr   r  r4   r4   r5   r<     s,   


z"Phi3ForTokenClassification.forwardr   )r=   r>   r?   r)   r   r   r   r   r   r@   r   r   r	   rA   r   r   r<   rB   r4   r4   r2   r5   r    sH    	
r  )r   r   r   r  r  )rP   )Nr    )Htypingr   r   r   r   r@   r   Zactivationsr   Zcache_utilsr	   r
   r   r   Z
generationr   Zintegrationsr   Zmodeling_attn_mask_utilsr   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   r   Zconfiguration_phi3r!   Z!torch.nn.attention.flex_attentionr"   Zintegrations.flex_attentionr#   Z
get_loggerr=   r   Moduler$   rG   r   r   rO   r   rh   ro   rp   r   r   r   r   r   r   r   r  r  __all__r4   r4   r4   r5   <module>   s|   


 KL"   VF