o
    Zh                     @   s$  d dl mZmZmZmZmZ d dlZd dlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) e$ rd dl*m+Z+ ddl,m-Z- e%.e/Z0G dd dej1Z2G dd dej1Z3dej4de5dej4fddZ6	d9dej1dej4dej4d ej4d!eej4 d"e7d#e7fd$d%Z8d&d' Z9d:d(d)Z:G d*d+ d+ej1Z;G d,d- d-ej1Z<G d.d/ d/eZ=e"G d0d1 d1eZ>e"G d2d3 d3e>Z?G d4d5 d5ee!Z@e"G d6d7 d7e>eZAg d8ZBdS );    )CallableListOptionalTupleUnionN   )ACT2FN)CacheHybridCacheStaticCache)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging)deprecate_kwarg   )Cohere2Config)	BlockMask)make_flex_block_causal_maskc                       s8   e Zd Zddef fddZe edd Z  Z	S )Cohere2RotaryEmbeddingNconfigc                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)super__init__hasattrr"   getr#   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr!   r   Zrope_init_fnattention_scalingZregister_bufferr&   Zoriginal_inv_freq)selfr!   devicer&   	__class__ [/var/www/auris/lib/python3.10/site-packages/transformers/models/cohere2/modeling_cohere2.pyr)   3   s   
zCohere2RotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtr2|jjdkr2|jjnd}tj	|dd* | |  
dd}tj|ddd	}| | j }| | j }	W d    n1 shw   Y  |j|jd
|	j|jd
fS )Nr   r   ZmpscpuF)device_typeenabled   dimdtype)r&   floatexpandshape
isinstancer.   r$   strtorchZautocast	transposeZrepeat_interleavecosr,   sintor;   )
r-   xposition_idsZinv_freq_expandedZposition_ids_expandedr5   ZfreqsZembrC   rD   r1   r1   r2   forwardD   s   (&zCohere2RotaryEmbedding.forwardN)
__name__
__module____qualname__r   r)   rA   no_gradr   rH   __classcell__r1   r1   r/   r2   r    2   s
    r    c                       s&   e Zd Zd fdd	Zdd Z  ZS )	Cohere2LayerNormNh㈵>Fc                    s&   t    tt|| _|| _dS )zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)r(   r)   nn	ParameterrA   Zonesweightvariance_epsilon)r-   hidden_sizeepsbiasr/   r1   r2   r)   U   s   

zCohere2LayerNorm.__init__c                 C   sl   |j }|tj}|jddd}|| djddd}|| t|| j  }| jtj| }||S )Nr3   T)Zkeepdimr7   )	r;   rE   rA   float32meanpowZrsqrtrT   rS   )r-   hidden_statesZinput_dtyperY   Zvariancer1   r1   r2   rH   [   s   
zCohere2LayerNorm.forward)NrP   FrJ   rK   rL   r)   rH   rN   r1   r1   r/   r2   rO   T   s    rO   r[   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r>   r=   reshape)r[   r]   batchnum_key_value_headsslenhead_dimr1   r1   r2   	repeat_kve   s
   0rd           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr7   r   r3   )r9   r;   )ptrainingr   )rd   num_key_value_groupsrA   matmulrB   r>   rQ   Z
functionalZsoftmaxrX   rE   r;   rl   ro   
contiguous)rf   rg   rh   ri   rj   rk   rl   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr1   r1   r2   eager_attention_forwardq   s   
&ry   c                 C   sB   | dd d df }| ddd df }t j| |gddd}|S )N.r7   r   r3   r8   rm   )rA   stackflatten)rF   x1Zx2Zrot_xr1   r1   r2   rotate_half   s   r}   c           	      C   sj   | j }|  } | }||}||}| | t| |  }|| t||  }|j|d|j|dfS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r:   )r;   r<   	unsqueezer}   rE   )	qkrC   rD   rG   Zunsqueeze_dimr;   Zq_embedZk_embedr1   r1   r2   apply_rotary_pos_emb   s   

r   c                       s   e Zd ZdZddedee f fddZ		ddej	de
ej	ej	f d	eej	 d
ee deej dee de
ej	eej	 ee
ej	  f fddZ  ZS )Cohere2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr!   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jd | jj dkrw|j| _d S d | _d S )Nrc   g      TrW   r   r   )r(   r)   r!   r   getattrrU   Znum_attention_headsrc   ra   rp   rk   attention_dropoutZ	is_causalrQ   LinearZattention_biasq_projk_projv_projo_projsliding_window_patternsliding_windowr-   r!   r   r/   r1   r2   r)      s0   
zCohere2Attention.__init__r[   position_embeddingsrj   past_key_valuecache_positionrs   r^   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}| jd urGt|	|
||\}	}
|d ur||| j|d}|	|
|| j
|\}
}|d ur| jjdkr|j d }|
d d d d d |d d f |d d d d d |d d f }
}t}| jjdkr| jjdkr|dd	rtd
 nt| jj }|| |	|
||f| jsdn| j| j| jd|\}}|jg |dR   }| |}||fS )Nr3   r   r7   )rD   rC   r   r   flash_attention_2eagerZsdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.re   )rl   rk   r   )r>   rc   r   viewrB   r   r   r   r   updater   r!   _attn_implementationry   r+   loggerwarning_oncer   ro   r   rk   r_   rr   r   )r-   r[   r   rj   r   r   rs   Zinput_shapeZhidden_shapeZquery_statesrt   ru   rC   rD   Zcache_kwargsseq_lenZattention_interfacerx   rv   r1   r1   r2   rH      sR   	

B	

zCohere2Attention.forwardrI   )NN)rJ   rK   rL   __doc__r   r   intr)   rA   Tensorr   r	   
LongTensorr   r   rH   rN   r1   r1   r/   r2   r      s(    r   c                       s$   e Zd Z fddZdd Z  ZS )
Cohere2MLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFr   )r(   r)   r!   rU   Zintermediate_sizerQ   r   	gate_projup_proj	down_projr   Z
hidden_actact_fnr-   r!   r/   r1   r2   r)     s   
zCohere2MLP.__init__c                 C   s$   |  | | || | }|S rI   )r   r   r   r   )r-   rF   r   r1   r1   r2   rH     s    zCohere2MLP.forwardr\   r1   r1   r/   r2   r     s    
r   c                       s   e Zd Zdedef fddZeddd							dd
ejde	ejejf de
ej de
e de
e de
e de
ej dee de	eje
e	ejejf  f fddZ  ZS )Cohere2DecoderLayerr!   r   c                    sb   t    |j| _t||| _t|| _t|j|jd| _	|| _
|d | j
j dk| _|j| _d S )NrU   rV   r   r   )r(   r)   rU   r   	self_attnr   mlprO   layer_norm_epsinput_layernormr!   r   
is_slidingr   r   r/   r1   r2   r)     s   

zCohere2DecoderLayer.__init__Zlast_cache_positionz4.53.0)versionNFr[   r   rj   r   r   	use_cacher   rs   r^   c              
   K   s<  | j rn|durnt|jd | j}	| jjdkr"|dd|	 df }nLt|jj	}
tj
tj|tjd| j d}t||
|}|d |	 d }tj|dd}tjt	|	|jd |jd	}||7 }|dddddd|f }|}| |}| jd|||||||d
|\}}| |}|| | }|f}|r||f7 }|S )ax  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
        Nr   r   r:   Zdiagonalr3   r   )minr.   )r[   r   rj   r   r   r   r   r1   )r   maxr>   r   r!   r   rA   finfor;   r   ZtrilZ	ones_likeboolwhereclamparanger.   r   r   r   )r-   r[   r   rj   r   r   r   r   rs   Zeffective_seq_len	min_dtypeZsliding_window_maskoffsetZmask_indexesZresidualZhidden_states_attentionZself_attn_weightsZhidden_states_mlpoutputsr1   r1   r2   rH   &  sF    



zCohere2DecoderLayer.forward)NNFFN)rJ   rK   rL   r   r   r)   r   rA   r   r   r   r	   r   r   r   r   FloatTensorrH   rN   r1   r1   r/   r2   r     s6    

	
r   c                   @   sH   e Zd ZeZdZdZdgZdgZdZ	dZ
dZdZdZdZdZdd ZdS )Cohere2PreTrainedModelmodelTr   past_key_valuesc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|trQ|jjd d S d S )Nre   )rY   stdg      ?)r!   Zinitializer_ranger?   rQ   r   rS   dataZnormal_rW   Zzero_	Embeddingpadding_idxrO   Zfill_)r-   rf   r   r1   r1   r2   _init_weights  s   


z$Cohere2PreTrainedModel._init_weightsN)rJ   rK   rL   r   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attn_2Z_supports_sdpaZ_supports_flex_attnZ_supports_cache_classZ_supports_quantized_cacheZ_supports_static_cacheZ_supports_attention_backendr   r1   r1   r1   r2   r     s    r   c                       s  e Zd Zdef fddZdd Zdd Zee									d!d	e	e
j d
e	e
j de	e
j de	e de	e
j de	e de	e de	e de	e
j dee defddZe
 	d"d
ee
jdf de
jde
jdedef
ddZed
e
jdedede
jde
jdefdd Z  ZS )#Cohere2Modelr!   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r1   )r   ).0r   r!   r1   r2   
<listcomp>  s    z)Cohere2Model.__init__.<locals>.<listcomp>r   r   F)r(   r)   Zpad_token_idr   
vocab_sizerQ   r   rU   embed_tokensZ
ModuleListrangeZnum_hidden_layerslayersrO   r   normr    
rotary_embgradient_checkpointing	post_initr   r/   r   r2   r)     s   zCohere2Model.__init__c                 C      | j S rI   r   r-   r1   r1   r2   get_input_embeddings     z!Cohere2Model.get_input_embeddingsc                 C   
   || _ d S rI   r   r-   ri   r1   r1   r2   set_input_embeddings     
z!Cohere2Model.set_input_embeddingsN	input_idsrj   rG   r   inputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsr^   c
              
   K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|r]|d u r]| js]|j
\}}}t| j |||j| jd}|	d u ry|d uri| nd}tj|||j
d  |jd}	|d u r|	d}| |||	||}|}| ||}|rdnd }|rdnd }| jD ]%}|r||f7 }||f||||||	d	|
}|d }|r||d f7 }q| |}|r||f7 }t||||d
S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)Zmax_batch_sizeZmax_cache_lenr;   r.   r   r   r   r1   )r   rj   r   r   r   r   )last_hidden_stater   r[   
attentions)r!   r   r   r   
ValueErrorr   ro   r   r   r   r>   r
   r;   r.   Zget_seq_lengthrA   r   r~   _update_causal_maskr   r   r   r   )r-   r   rj   rG   r   r   r   r   r   r   r   
batch_sizer   _Zpast_seen_tokensrw   r[   r   Zall_hidden_statesZall_self_attnsZdecoder_layerZlayer_outputsr1   r1   r2   rH     s   






zCohere2Model.forwardFr   input_tensorc              	   C   s   | j jdkr|S | j jdkrt|tjrt|}|S |j|j}}|jd }t|t	t
fr2| }	n|d ur;|jd n|jd }	| j|||	||||jd d}
|
S )Nr   Zflex_attentionr   r3   r   sequence_lengthtarget_lengthr;   r.   r   r   )r!   r   r?   rA   r   r   r;   r.   r>   r
   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_position)r-   rj   r   r   r   r   r;   r.   r   r   rw   r1   r1   r2   r     s*   

	z Cohere2Model._update_causal_maskr   r   r;   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )Z
fill_valuer;   r.   r   r   r   r3   r   )r9   rA   r   r   fullr.   Ztriur   r_   r=   cloner>   rE   Zmasked_fill)rj   r   r   r;   r   r   rs   rw   r   Zmask_lengthZpadding_maskr1   r1   r2   r   =  s,    $
6  zBCohere2Model._prepare_4d_causal_attention_mask_with_cache_position)	NNNNNNNNN)F)rJ   rK   rL   r   r)   r   r   r   r   r   rA   r   r   r
   r   r   r   r   r   rH   rM   r   r   staticmethodr   r;   r   rN   r1   r1   r/   r2   r     s    	
`&r   c                   @   s   e Zd ZdS )KwargsForCausalLMN)rJ   rK   rL   r1   r1   r1   r2   r   u  s    r   c                       s6  e Zd ZdgZddiZddgdgfiZdef fddZd	d
 Zdd Z	dd Z
dd Zdd Zdd Zee											d)deej deej deej deeeeej f  deej deej dee dee dee d eej d!eeejf d"ee d#efd$d%Z						&	d*d'd(Z  ZS )+Cohere2ForCausalLMzlm_head.weightlm_headZcolwise_repr[   logitsr!   c                    sP   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _
|   d S r   )r(   r)   r   r   r   rQ   r   rU   r   logit_scaleZtie_word_embeddingsr   r   r/   r1   r2   r)   ~  s   
zCohere2ForCausalLM.__init__c                 C   s   | j jS rI   r   r   r   r1   r1   r2   r     s   z'Cohere2ForCausalLM.get_input_embeddingsc                 C   s   || j _d S rI   r   r   r1   r1   r2   r     s   z'Cohere2ForCausalLM.set_input_embeddingsc                 C   r   rI   r   r   r1   r1   r2   get_output_embeddings  r   z(Cohere2ForCausalLM.get_output_embeddingsc                 C   r   rI   r   )r-   Znew_embeddingsr1   r1   r2   set_output_embeddings  r   z(Cohere2ForCausalLM.set_output_embeddingsc                 C   r   rI   r   )r-   decoderr1   r1   r2   set_decoder  r   zCohere2ForCausalLM.set_decoderc                 C   r   rI   r   r   r1   r1   r2   get_decoder  r   zCohere2ForCausalLM.get_decoderNr   r   rj   rG   r   r   labelsr   r   r   r   logits_to_keeprs   r^   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }|| j	 }d}|dur]| j
d||| j jd|}t|||j|j|jdS )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, Cohere2ForCausalLM

        >> model = Cohere2ForCausalLM.from_pretrained("Cohere2ForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("Cohere2ForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r   rj   rG   r   r   r   r   r   r   )r   r   r   )lossr   r   r[   r   r1   )r!   r   r   r   r   r?   r   slicer   r   Zloss_functionr   r   r   r[   r   )r-   r   rj   rG   r   r   r   r   r   r   r   r   rs   r   r[   Zslice_indicesr   r   r1   r1   r2   rH     s<   '

zCohere2ForCausalLM.forwardTc	              	   K   s  |d ur2|d us|d |j d kr |d d |j d  d f }n|j d |j d kr2|d d |f }|d urb|d u rb| dd }||dkd |rb|d d |j d  d f }|jtjd}|d urr|d dkrr|d d}
n
|jtjdd d}
t|tr|j	dkr| j
jdks|
d	 d ur|
d	 j \}}}|
d	 j}n|
d
 j \}}|
d
 j}| jj||| | jjj|||d}|d ur||
d< |
|||||d |
S )Nr3   r   r   )Zmemory_format)r   r   )r   r   r7   r   r   r   r   r   )rG   r   r   r   rj   )r>   longZcumsumZmasked_fill_r   rA   Zcontiguous_formatr?   r
   ndimr!   r   r.   r   r   r   r   rS   r;   r   )r-   r   r   rj   r   r   rG   r   r   rs   Zmodel_inputsr   r   r   r.   r1   r1   r2   prepare_inputs_for_generation  sX   


	z0Cohere2ForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNTN) rJ   rK   rL   Z_tied_weights_keysZ_tp_planZ_pp_planr   r)   r   r   r   r   r   r   r   r   r   rA   r   r   r   r	   r   r   r   r   r   r   r   rH   r   rN   r1   r1   r/   r2   r   x  sv    	
Mr   )r   r   r   )re   )Nr   )Ctypingr   r   r   r   r   rA   Ztorch.nnrQ   Zactivationsr   Zcache_utilsr	   r
   r   Z
generationr   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   r   Zutils.deprecationr   Zconfiguration_cohere2r   Z!torch.nn.attention.flex_attentionr   Zintegrations.flex_attentionr   Z
get_loggerrJ   r   Moduler    rO   r   r   rd   r<   ry   r}   r   r   r   r   r   r   r   r   __all__r1   r1   r1   r2   <module>   sj   
"

Zd Y C