"""PyTorch CodeGen model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
from .configuration_codegen import CodeGenConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)  # flattens the trailing pair dim: (..., d, 2) -> (..., 2 * d)


def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)


class CodeGenAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        max_positions = config.max_position_embeddings

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_attention_heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
        self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)

        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.rotary_dim = config.rotary_dim
        pos_embd_dim = self.rotary_dim or self.embed_dim
        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)

    def _split_heads(self, x, n_head, dim_head, mp_num):
        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
        return reshaped

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
        """
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
        elif len(tensor.shape) == 4:
            tensor = tensor.permute(0, 2, 1, 3).contiguous()
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    def _attn(
        self,
        query,
        key,
        value,
        attention_mask=None,
        head_mask=None,
    ):
        # compute attention weights in fp32 to avoid overflow in low-precision runs
        query = query.to(torch.float32)
        key = key.to(torch.float32)

        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
            attn_weights += causal_mask

        attn_weights = attn_weights / self.scale_attn
        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[
        Tuple[torch.Tensor, Tuple[torch.Tensor]],
        Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
    ]:
        qkv = self.qkv_proj(hidden_states)
        # qkv is laid out in mp_num chunks for compatibility with the original TPU checkpoints
        mp_num = 4
        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

        local_dim = self.head_dim * self.num_attention_heads // mp_num
        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
        query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

        value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        value = value.permute(0, 2, 1, 3)

        embed_positions = self.embed_positions
        if embed_positions.device != position_ids.device:
            embed_positions = embed_positions.to(position_ids.device)
            self.embed_positions = embed_positions

        sincos = embed_positions[position_ids]
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        if self.rotary_dim is not None:
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        if layer_past is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_dim,
                "cache_position": cache_position,
            }
            key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)

        # compute self-attention: V x Softmax(QK^T)
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, layer_past)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # attn_output, present, (attentions)

class CodeGenMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size = 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states

class CodeGenBlock(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CodeGenAttention(config, layer_idx)
        self.mlp = CodeGenMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]

        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = attn_output + feed_forward_hidden_states + residual

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions)


@auto_docstring
class CodeGenPreTrainedModel(PreTrainedModel):
    config_class = CodeGenConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CodeGenBlock"]
    _skip_keys_device_placement = "past_key_values"
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            # Slightly different from Mesh Transformer JAX which uses truncated_normal for initialization
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class CodeGenModel(CodeGenPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([CodeGenBlock(config, layer_idx=i) for i in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        r"""
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        # kept for backward compatibility: non-`Cache` `past_key_values` inputs
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            if past_key_values is None:
                past_key_values = DynamicCache()
            else:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                )

        seq_length = inputs_embeds.shape[1]
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x num_attention_heads x N x N
        # head_mask has shape n_layer x batch x num_attention_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, seq_length)
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)
        output_shape = (-1, seq_length, hidden_states.size(-1))

        next_decoder_cache = None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    None,
                    causal_mask,
                    position_ids,
                    head_mask[i],
                    use_cache,
                    output_attentions,
                    cache_position,
                )
            else:
                outputs = block(
                    hidden_states=hidden_states,
                    layer_past=past_key_values,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    cache_position=cache_position,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                next_decoder_cache = outputs[1]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will
        # fail to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output_attentions is True, the sdpa implementation's forward falls back to the eager implementation
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows
            # when using left padding. This is required by the memory-efficient attention path of
            # F.scaled_dot_product_attention. Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

@auto_docstring(
    custom_intro="""
    The CodeGen Model transformer with a language modeling head on top.
    """
)
class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = CodeGenModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        # make sure sampling in fp16 works correctly and
        # compute loss in fp32 to match with mesh-tf version
        lm_logits = self.lm_head(hidden_states).to(torch.float32)

        loss = None
        if labels is not None:
            # move labels to the logits device to enable model parallelism
            labels = labels.to(lm_logits.device)
            loss = self.loss_function(lm_logits, labels, vocab_size=self.config.vocab_size, **kwargs)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )

__all__ = ["CodeGenForCausalLM", "CodeGenModel", "CodeGenPreTrainedModel"]