
    fTh;                        S SK JrJrJrJr  S SKrS SKJr  SSKJr  SSK	J
r
  SSKJr  SSKJrJr  S	S
KJrJrJrJrJrJr  S	SKJr  SSKJr  \R6                  " \5      r " S S\5      r " S S\R>                  5      r  " S S\R>                  5      r! " S S\5      r" " S S\5      r# " S S\5      r$ " S S\5      r% " S S\5      r&/ SQr'g)    )ListOptionalTupleUnionN)nn   )ACT2FN)FlashAttentionKwargs)Unpack)is_torchdynamo_compilinglogging   )KwargsForCausalLMLlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel)MistralRMSNorm   )Mistral3Configc                       \ rS rSrSrg)Mistral3RMSNorm(    N__name__
__module____qualname____firstlineno____static_attributes__r       e/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mistral3/modular_mistral3.pyr   r   (       r"   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S	r
U =r$ )
Mistral3PatchMerger,   z4
Learned merging of spatial_merge_size ** 2 patches
configc                   > [         TU ]  5         Xl        UR                  R                  nUR
                  U l        U R                  R                  R                  U l        [        R                  " X R
                  S-  -  USS9U l	        g )Nr   Fbias)
super__init__r(   vision_confighidden_sizespatial_merge_size
patch_sizer   Linearmerging_layer)selfr(   r/   	__class__s      r#   r-   Mistral3PatchMerger.__init__1   sn    **66"(";";++33>>YY{5L5La5O'OQ\chir"   image_featuresimage_sizesreturnc                    U Vs/ s H&  o3S   U R                   -  US   U R                   -  4PM(     nnU VVs/ s H	  u  pEXE-  PM     nnnUR                  S   n/ n[        UR                  U5      5       H  u  pX)   u  pEU
R	                  XEU5      R                  SSS5      R                  S5      n[        R                  R                  R                  XR                  U R                  S9nUR	                  XpR                  S-  -  S5      R                  5       nUR                  U5        M     [        R                  " USS9nU R                  U5      nU$ s  snf s  snnf )Nr   r   r   )kernel_sizestridedim)r1   shape	enumeratesplitviewpermute	unsqueezetorchr   
functionalunfoldr0   tappendcatr3   )r4   r7   r8   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r#   forwardMistral3PatchMerger.forward:   s[   cn
cnU_]doo-z!}/OPcn 	 
 /::kdaAEk:  $)2>3G3GHX3Y)Z%K+DA%**13;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4( *[ ?:++N;)
 ;s
   -EE!)r(   r3   r1   r0   )r   r   r   r    __doc__r   r-   rF   TensorrV   r!   __classcell__r5   s   @r#   r&   r&   ,   sD    j~ jell  RWR^R^  r"   r&   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Mistral3MultiModalProjectorR   r(   c                   > [         TU ]  5         [        UR                  R                  UR
                  R                  S9U l        [        U5      U l	        [        UR                  [        5      (       a  SO[        UR                  5      n[        R                  " UR                  R                  U-  UR
                  R                  UR                   S9U l        [$        UR&                     U l        [        R                  " UR
                  R                  UR
                  R                  UR                   S9U l        g )N)epsr   r*   )r,   r-   r   r.   r/   text_configrms_norm_epsnormr&   patch_merger
isinstancevision_feature_layerintlenr   r2   multimodal_projector_biaslinear_1r	   projector_hidden_actactlinear_2)r4   r(   num_feature_layersr5   s      r#   r-   $Mistral3MultiModalProjector.__init__S   s    #F$8$8$D$D&J\J\JiJij	/7",V-H-H#"N"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r"   r7   r8   c                     U R                  U5      nU R                  X5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ N)rc   rd   rj   rl   rm   )r4   r7   r8   hidden_statess       r#   rV   #Mistral3MultiModalProjector.forwardc   sP    >2**>Gn5/m4r"   )rl   rj   rm   rc   rd   )r   r   r   r    r   r-   rF   rY   rV   r!   rZ   r[   s   @r#   r]   r]   R   s/    
~ 
 ell   r"   r]   c                       \ rS rSrSrg)Mistral3CausalLMOutputWithPastl   r   Nr   r   r"   r#   ru   ru   l   r$   r"   ru   c                       \ rS rSrSrg)Mistral3ModelOutputWithPastp   r   Nr   r   r"   r#   rx   rx   p   r$   r"   rx   c                       \ rS rSrS rSrg)Mistral3PreTrainedModelt   c                    [        U R                  SU R                  R                  5       R                  5      n[	        U[
        R                  5      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [	        U[
        R                  5      (       aJ  UR                  R                  R                  S5        UR                  R                  R                  5         g [	        U[        5      (       a&  UR                  R                  R                  S5        g g )Ninitializer_rangeg        )meanstdg      ?)getattrr(   get_text_configr~   re   r   r2   weightdatanormal_r+   zero_	LayerNormfill_r   )r4   moduler   s      r#   _init_weights%Mistral3PreTrainedModel._init_weightsu   s     dkk#68S8S8U8g8ghfbii((MM&&CS&9{{&  &&( '--MM$$S)KK""$00MM$$S) 1r"   r   N)r   r   r   r    r   r!   r   r"   r#   r{   r{   t   s    *r"   r{   c            !          \ rS rSr SS\R
                  S\R                  S\\\	\
\	   4      4S jjr             SS\R                  S\R
                  S\\R                     S	\\R                     S
\\
\R
                        S\\R
                     S\\\	\
\	   4      S\\   S\\   S\\   S\\   S\\R                     S\R                  S\\   S\\\4   4S jjrSrg)Mistral3Model   Npixel_valuesr8   rf   c                    Ub  UOU R                   R                  nUR                  5        VVs0 s H  u  pVUc  M
  XV_M     nnnU R                  " U4USS.UD6n[	        U[
        5      (       a  UR                  U   nO3U V	s/ s H  oR                  U	   PM     n
n	[        R                  " U
SS9nU R                  UR                  S5      U5      nU$ s  snnf s  sn	f )a  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
       The tensors corresponding to the input images.
    vision_feature_layer (`Union[int, List[int]]`, *optional*):
        The index of the layer to select the vision feature. If multiple indices are provided,
        the vision feature of the corresponding indices will be concatenated to form the
        vision features.
    image_sizes (`torch.Tensor`, *optional*):
        Tensor containing the image sizes as returned by the processor.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
T)r8   output_hidden_statesr;   r>   r   )r(   rf   itemsvision_towerre   rg   rr   rF   rK   multi_modal_projectorsqueeze)r4   r   r8   rf   kwargskvimage_outputsselected_image_feature	layer_idxhs_poolr7   s               r#   get_image_features Mistral3Model.get_image_features   s    . %9$D $++JjJj 	 $*<<>C>41Q$!$>C)),uKfjuntu *C00%2%@%@AU%V"OcdOc)229=OcGd%*YYwB%?"334J4R4RST4UWbc D es   	CCC	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsr   return_dictcache_positionr   r9   c                 `   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [        S5      eUb  Ub  [        S5      eUc  U R                  5       " U5      nUGb$  U R                  UUUS9nXR                   R                  :H  R                  S5      nUR                  U5      R                  UR                  5      n[        5       (       d{  UU   R                  5       UR                  5       :w  aV  XR                   R                  :H  R                  5       nUR                   S   UR                   S   -  n[        SU SU 35      eUR                  UR                  UR"                  5      nUR%                  UU5      nU R&                  " SUUUUUU	U
S	US
.	UD6n[)        UR*                  UR,                  UR.                  UR0                  Ub  WS9$ S S9$ )Nz:You must specify exactly one of input_ids or inputs_embedszdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)r   rf   r8   r;   r   r   z6Image features and image tokens do not match: tokens: z, features T)	r   r   r   r   r   r   r   r   r   )last_hidden_stater   rr   
attentionsimage_hidden_statesr   )r(   r   r   use_return_dictrf   
ValueErrorget_input_embeddingsr   image_token_idrE   	expand_astodevicer   numelsumr@   dtypemasked_scatterlanguage_modelrx   r   r   rr   r   )r4   r   r   r   r   r   r   rf   r   r   r   r   r   r8   r   r7   special_image_maskn_image_tokensn_image_featuresoutputss                       r#   rV   Mistral3Model.forward   sj   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	 -t";<YZZ#(Av    557	BM#!44)%9' 5 N #,{{/I/I"I!T!TUW!X!3!=!=m!L!O!OP]PdPd!e+---@R2S2Y2Y2[_m_s_s_u2u"+{{/I/I"I!N!N!P#1#7#7#:^=Q=QRS=T#T  L^L\\ghxgyz  ,..}/C/C]EXEXYN)889K^\M%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r"   r   rq   )NNNNNNNNNNNNN)r   r   r   r    rF   FloatTensorrY   r   r   rg   r   r   
LongTensorboolr   r
   r   rx   rV   r!   r   r"   r#   r   r      s   
 AE	&''& \\& 'uS$s)^'<=	&T '+*.1537=A59@D$(,0/3&*59$(J
##J
 ''J
 !.	J

 u//0J
 "$u'8'8"9:J
   1 12J
 'uS$s)^'<=J
 D>J
 $D>J
 'tnJ
 d^J
 !!1!12J
 \\J
 -.J
  
u11	2!J
 J
r"   r   c            #          \ rS rSr              SS\R
                  S\R                  S\\R                     S\\R
                     S\\	\R                        S\\R                     S	\\R
                     S
\\
   S\\
   S\\
   S\\
   S\\R
                     S\\\R                  4   S\\R                     S\\   S\\\4   4 S jjrSrg) Mistral3ForConditionalGeneration   Nr   r   r   r   r   r   labelsr   r   r   r   r   logits_to_keepr8   r   r9   c                 F   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  " SUUUUUUUU	U
SUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb3  U R                  " SUXpR                   R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

>>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
>>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

>>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is the image?The image depicts two cats lying on a pink blanket."
```NT)r   r   r   r   r   r   r   r   r   r   r   r8   r   )logitsr   
vocab_size)lossr   r   rr   r   r   r   )r(   r   r   r   modelre   rg   slicelm_headloss_functionra   r   ru   r   rr   r   r   )r4   r   r   r   r   r   r   r   r   r   r   r   r   r   r8   r   r   rr   slice_indicesr   r   s                        r#   rV   (Mistral3ForConditionalGeneration.forward   sM   Z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
%)%+'/!5)#
 
   
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r"   r   )NNNNNNNNNNNNr   N)r   r   r   r    rF   r   r   r   rY   r   r   r   rg   r   r   r   ru   rV   r!   r   r"   r#   r   r      sq    '+*.1537=A59-1$(,0/3&*5934.2U
##U
 ''U
 !.	U

 u//0U
 "$u'8'8"9:U
   1 12U
 ))*U
 D>U
 $D>U
 'tnU
 d^U
 !!1!12U
 c5<</0U
 ell+U
  *+!U
" 
u44	5#U
 U
r"   r   )r   r{   r   )(typingr   r   r   r   rF   r   activationsr	   modeling_flash_attention_utilsr
   processing_utilsr   utilsr   r   llava.modeling_llavar   r   r   r   r   r   mistral.modeling_mistralr   configuration_mistral3r   
get_loggerr   loggerr   Moduler&   r]   ru   rx   r{   r   r   __all__r   r"   r#   <module>r      s     0 /   ! B & 6  6 2 
		H	%	n 	#")) #L")) 4	%@ 		": 	*2 *$s
J s
lV
'D V
rr"   