"""PyTorch Phi-3 model."""

from typing import Callable, Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ..mistral.modeling_mistral import (
    MistralDecoderLayer,
    MistralForCausalLM,
    MistralForSequenceClassification,
    MistralForTokenClassification,
    MistralPreTrainedModel,
    eager_attention_forward,
    rotate_half,
)
from .configuration_phi3 import Phi3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
_CONFIG_FOR_DOC = "Phi3Config"


class Phi3MLP(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.config = config
        # Gate and up projections are fused into a single linear layer; the output is split in `forward`.
        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        up_states = self.gate_up_proj(hidden_states)

        gate, up_states = up_states.chunk(2, dim=-1)
        up_states = up_states * self.activation_fn(gate)

        return self.down_proj(up_states)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Partial rotary embedding: only the first `rotary_dim` channels are rotated,
    # the remaining channels are passed through unchanged.
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
    k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1)
    return q_embed, k_embed


class Phi3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # Query, key and value projections are fused into a single linear layer.
        op_size = config.num_attention_heads * self.head_dim + 2 * (config.num_key_value_heads * self.head_dim)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.qkv_proj = nn.Linear(config.hidden_size, op_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # Split the fused QKV projection into query, key and value states.
        qkv = self.qkv_proj(hidden_states)
        query_pos = self.config.num_attention_heads * self.head_dim
        query_states = qkv[..., :query_pos]
        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]

        query_states = query_states.view(hidden_shape).transpose(1, 2)
        key_states = key_states.view(hidden_shape).transpose(1, 2)
        value_states = value_states.view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=getattr(self.config, "sliding_window", None),
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Phi3DecoderLayer(MistralDecoderLayer):
    def __init__(self, config: Phi3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.config = config
        self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Phi3MLP(config)

        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
            past_key_value (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + self.resid_attn_dropout(hidden_states)

        # Fully connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.resid_mlp_dropout(hidden_states)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class Phi3PreTrainedModel(MistralPreTrainedModel):
    _version = "0.0.5"


class Phi3ForCausalLM(MistralForCausalLM):
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- this model may need to switch between the short and long RoPE factors. When the input
        # length first crosses `original_max_position_embeddings`, the cached keys/values were computed with the
        # short factors, so the cache is dropped and recomputed with the long factors.
        if (
            past_key_values
            and self.config.rope_scaling
            and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
        ):
            past_length = cache_position[0]
            if past_length <= self.config.original_max_position_embeddings:
                past_key_values = None

        model_inputs = super().prepare_inputs_for_generation(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return model_inputs


class Phi3ForSequenceClassification(MistralForSequenceClassification):
    pass


class Phi3ForTokenClassification(MistralForTokenClassification):
    pass


__all__ = [
    "Phi3PreTrainedModel",
    "Phi3Model",
    "Phi3ForCausalLM",
    "Phi3ForSequenceClassification",
    "Phi3ForTokenClassification",
]