# Modular definition of the Phi model: Phi-specific classes are expressed as
# deltas on top of the Llama (and CLIP) implementations.

from functools import partial
from typing import Callable, Optional, Tuple

import torch
import torch.nn as nn

from ...cache_utils import Cache, DynamicCache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ..clip.modeling_clip import CLIPMLP
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaForCausalLM,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaModel,
    LlamaPreTrainedModel,
    LlamaRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from .configuration_phi import PhiConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "microsoft/phi-1"
_CONFIG_FOR_DOC = "PhiConfig"
class PhiAttention(LlamaAttention):
    def __init__(self, config: PhiConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        # Phi uses biased projections and calls the output projection `dense`.
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
        self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True)
        del self.o_proj
        # Only a fraction of each head's channels receives rotary position embeddings.
        self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor)
        self.qk_layernorm = config.qk_layernorm
        if self.qk_layernorm:
            self.q_layernorm = nn.LayerNorm(
                config.hidden_size // config.num_attention_heads, eps=config.layer_norm_eps, elementwise_affine=True
            )
            self.k_layernorm = nn.LayerNorm(
                config.hidden_size // config.num_attention_heads, eps=config.layer_norm_eps, elementwise_affine=True
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if self.qk_layernorm:
            query_states = self.q_layernorm(query_states)
            key_states = self.k_layernorm(key_states)

        cos, sin = position_embeddings
        # Partial rotary embedding: rotate only the first `rotary_ndims` channels.
        query_rot, query_pass = (
            query_states[..., : self.rotary_ndims],
            query_states[..., self.rotary_ndims :],
        )
        key_rot, key_pass = (
            key_states[..., : self.rotary_ndims],
            key_states[..., self.rotary_ndims :],
        )
        query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)

        # Re-assemble the rotated and pass-through halves.
        query_states = torch.cat((query_rot, query_pass), dim=-1)
        key_states = torch.cat((key_rot, key_pass), dim=-1)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.dense(attn_output)
        return attn_output, attn_weights


class PhiMLP(CLIPMLP):
    pass
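# Worked example of the partial-rotary split above (added commentary; the concrete
# numbers are illustrative assumptions, not values read from a shipped checkpoint):
# with head_dim = 64 and partial_rotary_factor = 0.5, rotary_ndims = int(64 * 0.5) = 32,
# so for each head only channels [0:32] of the query/key vectors are rotated by
# `apply_rotary_pos_emb`, while channels [32:64] pass through unchanged and are
# concatenated back before attention scores are computed.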
class PhiDecoderLayer(nn.Module):
    def __init__(self, config: PhiConfig, layer_idx: int):
        super().__init__()
        self.self_attn = PhiAttention(config, layer_idx=layer_idx)
        self.mlp = PhiMLP(config)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        attn_outputs, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        attn_outputs = self.resid_dropout(attn_outputs)

        feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
        # Parallel residual: attention and MLP both read the same normalized input.
        hidden_states = attn_outputs + feed_forward_hidden_states + residual
        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class PhiRotaryEmbedding(LlamaRotaryEmbedding):
    pass
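# Design note (added commentary, not original code): unlike Llama's sequential
# block, x -> x + attn(norm1(x)) followed by x -> x + mlp(norm2(x)), the Phi
# layer above uses a single LayerNorm and a parallel residual,
#     out = attn(ln(x)) + mlp(ln(x)) + x,
# so the attention and MLP branches are computed from the same normalized input
# and summed with the raw residual in one step.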
class PhiPreTrainedModel(LlamaPreTrainedModel):
    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


class PhiModel(LlamaModel):
    def __init__(self, config: PhiConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [PhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.embed_dropout = nn.Dropout(config.embd_pdrop)
        # Phi normalizes once at the end of the stack instead of using LlamaModel's `norm`.
        self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        del self.norm

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        inputs_embeds = self.embed_dropout(inputs_embeds)
        hidden_states = inputs_embeds

        # Rotary position embeddings are computed once and shared by all layers.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    partial(decoder_layer.__call__, **flash_attn_kwargs),
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
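# Shape walkthrough for PhiModel.forward (added commentary; batch size B and
# sequence length S are illustrative placeholders): input_ids [B, S] passes
# through embed_tokens and embed_dropout to give hidden_states [B, S, hidden_size];
# every PhiDecoderLayer preserves that shape; final_layernorm then yields
# last_hidden_state [B, S, hidden_size], returned in a BaseModelOutputWithPast
# together with the (optional) key/value cache.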
class PhiForCausalLM(LlamaForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        # Unlike Llama, the Phi LM head carries a bias term.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True)


class PhiForSequenceClassification(LlamaForSequenceClassification):
    pass


class PhiForTokenClassification(LlamaForTokenClassification):
    pass


__all__ = [
    "PhiPreTrainedModel",
    "PhiModel",
    "PhiForCausalLM",
    "PhiForSequenceClassification",
    "PhiForTokenClassification",
]