
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, SlidingWindowCache, StaticCache
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, QuestionAnsweringModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import is_torch_flex_attn_available, logging
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaMLP,
    LlamaModel,
    LlamaPreTrainedModel,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from .configuration_mistral import MistralConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1"


class MistralMLP(LlamaMLP):
    def __init__(self, config):
        super().__init__(config)
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)


class MistralAttention(LlamaAttention):
    def __init__(self, config: MistralConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=getattr(self.config, "sliding_window", None),
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class MistralDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: MistralConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.self_attn = MistralAttention(config=config, layer_idx=layer_idx)
        self.mlp = MistralMLP(config)


class MistralPreTrainedModel(LlamaPreTrainedModel):
    pass


class MistralModel(LlamaModel):
    def __init__(self, config: MistralConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
                        " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)

        # When output_attentions is True, the sdpa implementation falls back to eager, so the mask can sometimes
        # be skipped entirely and SDPA's `is_causal` argument used instead
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided attention mask is 2D, generate the full causal 4D mask here
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows of the causal mask, for example the relevant first rows when
            # using left padding; this is required by the memory-efficient path of F.scaled_dot_product_attention
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        config: MistralConfig,
        past_key_values: Cache,
    ):
        r"""
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`MistralConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
                -1, 1
            )
            text_config = config.get_text_config()
            if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
                # If sliding window is used, tokens beyond the window length should not be attended to either;
                # the check also verifies whether the current checkpoint was trained with a sliding window
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
                        cache_position.reshape(-1, 1) - text_config.sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


class MistralForCausalLM(LlamaForCausalLM):
    pass


class MistralForTokenClassification(LlamaForTokenClassification):
    pass


class MistralForSequenceClassification(LlamaForSequenceClassification):
    pass


class MistralForQuestionAnswering(LlamaForQuestionAnswering):
    base_model_prefix = "model"  # the backbone is exposed as `model` instead of Llama's `transformer`

    def __init__(self, config):
        super().__init__(config)
        self.model = MistralModel(config)
        del self.transformer

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        **kwargs,
    ) -> QuestionAnsweringModelOutput:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "MistralForCausalLM",
    "MistralForQuestionAnswering",
    "MistralModel",
    "MistralPreTrainedModel",
    "MistralForSequenceClassification",
    "MistralForTokenClassification",
]