"""PyTorch Starcoder2 model."""

from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ..mistral.modeling_mistral import (
    MistralAttention,
    MistralDecoderLayer,
    MistralForCausalLM,
    MistralForSequenceClassification,
    MistralForTokenClassification,
    MistralModel,
    MistralPreTrainedModel,
    MistralRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from .configuration_starcoder2 import Starcoder2Config


logger = logging.get_logger(__name__)
S )Starcoder2MLPconfigc                    sT   t    |j}tj||j|jd| _tj|j||jd| _t	|j
 | _|j| _d S N)bias)super__init__hidden_sizer   LinearZintermediate_sizeuse_biasc_fcc_projr	   Z
hidden_actactresidual_dropout)selfr   Z	embed_dim	__class__ `/var/www/auris/lib/python3.10/site-packages/transformers/models/starcoder2/modular_starcoder2.pyr#   6   s   
zStarcoder2MLP.__init__hidden_statesreturnc                 C   s8   |  |}| |}| |}tjj|| j| jd}|S )Nptraining)r'   r)   r(   r   
functionaldropoutr*   r4   )r+   r0   r.   r.   r/   forward>   s
   


zStarcoder2MLP.forward)__name__
__module____qualname__r   r#   r   r   torchFloatTensorr7   __classcell__r.   r.   r,   r/   r   5   s    &r   c                       s   e Zd Zddedee f fddZ		ddejde	ejejf deej d	ee
 d
eej dee de	ejeej ee	ej  f fddZ  ZS )Starcoder2AttentionNr   	layer_idxc                    s   t    |j| _tj|j|j| j |jd| _	tj|j|j
| j |jd| _tj|j|j
| j |jd| _tj|j| j |j|jd| _d S r    )r"   r#   r*   r   r%   r$   Znum_attention_headshead_dimr&   q_projZnum_key_value_headsk_projv_projo_projr+   r   r?   r,   r.   r/   r#   G   s   
"zStarcoder2Attention.__init__r0   position_embeddingsattention_maskpast_key_valuecache_positionkwargsr1   c                 K   sj  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkrw| jjdkrq|ddrqtd	 nt| jj }|| |	|
||f| jsd
n| j| jt| jdd d|\}}|jg |dR   }| |}tjj|| j| jd}||fS )Nr   r   )sincosrI   eagerZsdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        sliding_window)r6   scalingrQ   r2   )shaper@   rA   viewZ	transposerB   rC   r   updater?   r   r   Z_attn_implementationgetloggerwarning_oncer   r4   Zattention_dropoutrR   getattrZreshape
contiguousrD   r   r5   r6   r*   )r+   r0   rF   rG   rH   rI   rJ   Zinput_shapeZhidden_shapeZquery_statesZ
key_statesZvalue_statesrM   rL   Zcache_kwargsZattention_interfaceZattn_outputZattn_weightsr.   r.   r/   r7   O   sH   		


zStarcoder2Attention.forward)N)NN)r8   r9   r:   r   r   intr#   r;   Tensorr   r
   
LongTensorr   r   r7   r=   r.   r.   r,   r/   r>   F   s&    r>   c                       s&   e Zd Zdedef fddZ  ZS )Starcoder2DecoderLayerr   r?   c                    sP   t  |  t||d| _t|| _tj|j|j	d| _
tj|j|j	d| _d S )N)r   r?   Zeps)r"   r#   r>   Z	self_attnr   Zmlpr   	LayerNormr$   norm_epsilonZinput_layernormZpost_attention_layernormrE   r,   r.   r/   r#      s
   
zStarcoder2DecoderLayer.__init__)r8   r9   r:   r   r[   r#   r=   r.   r.   r,   r/   r^      s    r^   c                   @      e Zd ZdS )Starcoder2RotaryEmbeddingNr8   r9   r:   r.   r.   r.   r/   rc          rc   c                   @   s   e Zd Zdd ZdS )Starcoder2PreTrainedModelc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|tjrX|jjd |jj	  d S d S )NrP   )meanstdg      ?)r   Zinitializer_range
isinstancer   r%   weightdataZnormal_r!   Zzero_Z	EmbeddingZpadding_idxr`   Zfill_)r+   modulerh   r.   r.   r/   _init_weights   s   

z'Starcoder2PreTrainedModel._init_weightsN)r8   r9   r:   rm   r.   r.   r.   r/   rf      s    rf   c                       s   e Zd Zdef fddZ									ddeej deej deej dee	e
eej f  d	eej d
ee dee dee deej dee defddZ  ZS )Starcoder2Modelr   c                    sL   t    t fddt jD | _tj j j	d| _
 j| _d S )Nc                    s   g | ]}t  |qS r.   )r^   ).0r?   r   r.   r/   
<listcomp>   s    z,Starcoder2Model.__init__.<locals>.<listcomp>r_   )r"   r#   r   Z
ModuleListrangenum_hidden_layerslayersr`   r$   ra   normembedding_dropout)r+   r   r,   rp   r/   r#      s   zStarcoder2Model.__init__N	input_idsrG   position_idspast_key_valuesinputs_embeds	use_cacherO   output_hidden_statesrI   flash_attn_kwargsr1   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rK|d u rKt
 }|	d u rg|d urW| nd}tj|||jd  |jd}	|d u rp|	d}| |||	||}|}tjj|| j| jd}| ||}|rdnd }|rdnd }| jd | j j D ]&}|r||f7 }||f||||||	|d	|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||d
S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )devicer2   r.   )rG   rx   rH   rO   r{   rI   rF   )Zlast_hidden_statery   r0   Z
attentions)r   rO   r|   r{   
ValueErrorZgradient_checkpointingr4   rW   rX   Zembed_tokensr   Zget_seq_lengthr;   ZarangerS   r~   Z	unsqueezeZ_update_causal_maskr   r5   r6   rv   Z
rotary_embrt   rs   ru   r   )r+   rw   rG   rx   ry   rz   r{   rO   r|   rI   r}   Zpast_seen_tokensZcausal_maskr0   rF   Zall_hidden_statesZall_self_attnsZdecoder_layerZlayer_outputsr.   r.   r/   r7      sz   




	


zStarcoder2Model.forward)	NNNNNNNNN)r8   r9   r:   r   r#   r   r;   r]   r\   r   r
   r   r<   boolr   r   r   r7   r=   r.   r.   r,   r/   rn      sD    
	
rn   c                   @   rb   )Starcoder2ForCausalLMNrd   r.   r.   r.   r/   r   
  re   r   c                   @   rb   )#Starcoder2ForSequenceClassificationNrd   r.   r.   r.   r/   r     re   r   c                   @   rb   ) Starcoder2ForTokenClassificationNrd   r.   r.   r.   r/   r     re   r   )r   rn   rf   r   r   )4__doc__typingr   r   r   r   r   r;   Ztorch.utils.checkpointr   Zactivationsr	   Zcache_utilsr
   r   Zmodeling_flash_attention_utilsr   Zmodeling_outputsr   Zmodeling_utilsr   Zprocessing_utilsr   utilsr   Zmistral.modeling_mistralr   r   r   r   r   r   r   r   r   r   Zconfiguration_starcoder2r   Z
get_loggerr8   rW   Moduler   r>   r^   rc   rf   rn   r   r   r   __all__r.   r.   r.   r/   <module>   s2   0
@	g