from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, SlidingWindowCache, StaticCache
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, QuestionAnsweringModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import is_torch_flex_attn_available, logging
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaMLP,
    LlamaModel,
    LlamaPreTrainedModel,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from .configuration_mistral import MistralConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1"
class MistralMLP(LlamaMLP):
    def __init__(self, config):
        super().__init__(config)
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)


class MistralAttention(LlamaAttention):
    def __init__(self, config: MistralConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=getattr(self.config, "sliding_window", None),  # main difference with Llama
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class MistralDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: MistralConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.self_attn = MistralAttention(config=config, layer_idx=layer_idx)
        self.mlp = MistralMLP(config)


class MistralPreTrainedModel(LlamaPreTrainedModel):
    pass


class MistralModel(LlamaModel):
    def __init__(self, config: MistralConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
                        " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, rely on its `is_causal` argument instead of its `attn_mask` argument in order
        # to dispatch on Flash Attention 2. This is not compatible with static cache, as SDPA would then fail to
        # infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)

        # When output_attentions=True, the sdpa implementation falls back to the eager one, so a mask is needed.
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, generate a 4D causal mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, e.g. the relevant first rows when using left padding.
            # This is required by the memory-efficient path of F.scaled_dot_product_attention.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        config: MistralConfig,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static
                cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`MistralConfig`):
                The model's configuration class.
            past_key_values (`Cache`):
                The cache class that is being used currently to generate.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
                -1, 1
            )
            text_config = config.get_text_config()
            if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
                # With a sliding window, tokens beyond the window length must also be masked out. The check is
                # needed to verify whether the current checkpoint was trained with a sliding window or not.
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
                        cache_position.reshape(-1, 1) - text_config.sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
Vra   c                   @   r]   )MistralForCausalLMNr_   r/   r/   r/   r0   r     r`   r   c                   @   r]   )MistralForTokenClassificationNr_   r/   r/   r/   r0   r     r`   r   c                   @   r]   ) MistralForSequenceClassificationNr_   r/   r/   r/   r0   r   #  r`   r   c                       s   e Zd ZdZ fddZdd Zdd Z									dd	eej	 d
eej
 deej	 deeeeej f  deej deej	 deej	 dee dee defddZ  ZS )MistralForQuestionAnsweringmodelc                    s   t  | t|| _| `d S N)r&   r'   ra   r   Ztransformerr*   r-   r/   r0   r'   *  s   
z$MistralForQuestionAnswering.__init__c                 C   s   | j jS r   r   Zembed_tokens)r+   r/   r/   r0   get_input_embeddings/  s   z0MistralForQuestionAnswering.get_input_embeddingsc                 C   s   || j _d S r   r   )r+   valuer/   r/   r0   set_input_embeddings2  s   z0MistralForQuestionAnswering.set_input_embeddingsN	input_idsr@   position_idsrg   rh   start_positionsend_positionsrJ   output_hidden_statesrD   c
              	   K   s   | j |||||||	d}|j}| |}|jddd\}}|d }|d }d}|durA|durA| j||||fi |
}t||||j|j	dS )a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        )r@   r   rg   rh   rJ   r   r    rE   )r|   N)lossstart_logits
end_logitsr>   
attentions)
r   Zlast_hidden_stateZ
qa_outputssplitZsqueezerV   Zloss_functionr   r>   r   )r+   r   r@   r   rg   rh   r   r   rJ   r   rC   ZoutputsZsequence_outputZlogitsr   r   r   r/   r/   r0   rW   5  s0   

z#MistralForQuestionAnswering.forward)	NNNNNNNNN)r1   r2   r3   Zbase_model_prefixr'   r   r   r   rY   r[   rZ   r   r	   r   ZFloatTensorr   r   rW   r4   r/   r/   r-   r0   r   '  sF    	
r   )r   r   ra   r^   r   r   );typingr   r   r   r   r   rY   Ztorch.utils.checkpointr   Zcache_utilsr	   r
   r   Zmodeling_attn_mask_utilsr   Zmodeling_flash_attention_utilsr   Zmodeling_outputsr   r   Zmodeling_utilsr   Zprocessing_utilsr   utilsr   r   Zllama.modeling_llamar   r   r   r   r   r   r   r   r   r   r   Zconfiguration_mistralr!   Z!torch.nn.attention.flex_attentionr"   Zintegrations.flex_attentionr#   Z
get_loggerr1   rS   Z_CHECKPOINT_FOR_DOCr$   r5   r\   r^   ra   r   r   r   r   __all__r/   r/   r/   r0   <module>   s:    4
< $D