
    fTh'T                        S r SSKJr  SSKJrJrJr  SSKrSSKrSSKJ	r	  SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJr  \R:                  " \5      rS r  " S S\	RB                  5      r"S r#S,S jr$ S-S\	RB                  S\RJ                  S\RJ                  S\RJ                  S\\RJ                     S\&S\&4S jjr' " S S\	RB                  5      r( " S S \	RB                  5      r) " S! S"\	RB                  5      r* " S# S$\	RB                  5      r+ " S% S&\	RB                  5      r,\ " S' S(\5      5       r-S) r.\ " S* S+\-5      5       r/S+S(/r0g).zPyTorch Pixtral model.    )Callable)OptionalTupleUnionN)nn   )ACT2FN)FlashAttentionKwargs)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )PixtralVisionConfigc                    / nU  H  nUR                   SS  u  pE[        R                  " [        R                  " U5      [        R                  " U5      SS9n[        R                  " USS9R                  SS5      R                  SS5      u  pxXq-  U-   n	UR                  U	S S 2S4   5        M     [        R                  " U5      $ )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_grididss
             d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgridr1   %   s    I"BC(~~ell62ELL4GRVWTr2::2qAGG2N 6)QT# # 99Y    c                   h   ^  \ rS rSrSrSU 4S jjr\R                  " 5       \S 5       5       r	Sr
U =r$ )PixtralRotaryEmbedding0   a  
The key with pixtral embedding is just that you have a frequency for each pixel positions.
If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
is given by indexing the pre_computed frequency on the width and height.

What you output is of dimension (batch, height * width, dim) with dim the embed dim.

This simply means that for each image hidden state, you are going to add
a corresponding positional embedding, based on its index in the grid.
c                 ~  > [         T
U ]  5         SU l        UR                  U l        UR
                  U l        UR                  UR                  -  nSU R                  [        R                  " SU R                  S5      R                  5       U R                  -  -  -  n[        R                  " X4R                  S9n[        R                  " X4R                  S9n[        R                  " XTS S S2   5      R                  5       n[        R                  " XdSS S2   5      R                  5       n[        R                  " US S 2S S S 24   R                  SUS5      US S S 2S S 24   R                  USS5      /SS9R!                  SU R                  S-  5      n	U R#                  S	[        R                  " X4SS9S
S9  g )Ndefault      ?r   r   )devicer   r   r   inv_freqF)
persistent)super__init__	rope_typehead_dimr   
rope_thetabase
image_size
patch_sizer   r    floatr9   outerr%   repeatr"   register_buffer)selfconfigr9   max_patches_per_sidefreqshwfreqs_hfreqs_wr:   	__class__s             r0   r=   PixtralRotaryEmbedding.__init__<   ss   "??%%	%00F4E4EEtyyU\\!TXXq%A%G%G%IDHH%TUVLL-llCLL-llC++ass,224++aqt!t-335994
#**1.BAFa
#**+?AF 
 '"dhh!m
$ 	 	ZH3GR)P]bcr2   c                    U R                   U   n[        UR                  R                  [        5      (       a0  UR                  R                  S:w  a  UR                  R                  OSn[
        R                  " USS9   UnUR                  5       nUR                  5       nS S S 5        WR                  UR                  S9WR                  UR                  S94$ ! , (       d  f       N@= f)NmpscpuF)device_typeenabled)dtype)r:   
isinstancer9   typestrr   autocastcossintorW   )rH   xposition_idsrK   rU   embr\   r]   s           r0   forwardPixtralRotaryEmbedding.forwardU   s     l+'1!((--'E'E!((--[`J`ahhmmfk^^UCC'')C'')C D
 vvAGGv$cff177f&;;; DCs    #C
C+)rA   r   r>   N)__name__
__module____qualname____firstlineno____doc__r=   r   no_gradr   rb   __static_attributes____classcell__rP   s   @r0   r4   r4   0   s0    	d2 ]]_	<  	<r2   r4   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r   r%   )r_   x1x2s      r0   rotate_halfrq   d   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezerq   )qkr\   r]   r`   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embry   k   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr2   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr   r   )r   rW   )ptrainingr   r   )r   matmul	transposer   
functionalsoftmaxfloat32r^   rW   r   r   
contiguous)
rz   r{   r|   r}   r~   r   r   kwargsattn_weightsattn_outputs
             r0   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r2   c                     ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\	\R                  \R                  4      S\\
   S\\   S	\	\R                  \\R                     4   4S
 jjrSrU =r$ )PixtralAttention   zA
Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        SU l        U R                  S-  U l	        SU l        UR                  U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        g )NFg      ࿩bias)r<   r=   rI   hidden_size	embed_dimnum_attention_heads	num_headsr?   	is_causalr   attention_dropoutr   r   Lineark_projv_projq_projo_projrH   rI   rP   s     r0   r=   PixtralAttention.__init__   s    ++33$..8}}d*//iiUKiiUKiiUKiiUKr2   hidden_statesr~   position_embeddingsoutput_attentionsr   returnc                 J   UR                  5       u  pgnU R                  U5      n	U R                  U5      n
U R                  U5      nU	R	                  XgU R
                  U R                  5      R                  SS5      n	U
R	                  XgU R
                  U R                  5      R                  SS5      n
UR	                  XgU R
                  U R                  5      R                  SS5      nUu  p[        XXSS9u  p[        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU R                  R                  S:X  a"  US	   R                  UR                   S
S9US	'   SnU" U U	U
UU4U R"                  (       d  SOU R$                  U R&                  S.UD6u  nnUR)                  XgS5      R+                  5       nU R-                  U5      nU(       d  SnUU4$ )z#Input shape: Batch x Time x Channelr   r   r   )rv   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.flash_attention_2r`   T)non_blockingN        )r   r   r   )sizer   r   r   viewr   r?   r   ry   r   rI   _attn_implementationloggerwarning_oncer   r^   r9   r   r   r   r"   r   r   )rH   r   r~   r   r   r   
batch_sizepatches_query_states
key_statesvalue_statesr\   r]   attention_interfacer   r   s                    r0   rb   PixtralAttention.forward   s    "/!3!3!5
Q{{=1[[/
{{=1#((dnndmm\ffghjkl__Z$..$--Xbbcdfgh
#((dnndmm\ffghjkl&#7RUjk#l (?;;++w6{{//69>O##L
 '>dkk>^>^&_# ;;++/BB%+N%;%>%>}?S?Sbf%>%gF>"!N$7	%
  $}}C$,,LL	%
 	%
!\ "))*rBMMOkk+. LL((r2   )rI   r   r   r?   r   r   r   r   r   r   r   )NNF)re   rf   rg   rh   ri   r=   r   Tensorr   r   boolr   r
   rb   rk   rl   rm   s   @r0   r   r      s    L* 26KO,16)||6) !.6) &eELL%,,,F&GH	6)
 $D>6) -.6) 
u||Xell33	46) 6)r2   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
PixtralMLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g )NFr   )r<   r=   rI   r   intermediate_sizer   r   	gate_projup_proj	down_projr	   
hidden_actact_fnr   s     r0   r=   PixtralMLP.__init__   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r2   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rd   )r   r   r   r   )rH   r_   r   s      r0   rb   PixtralMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r2   )r   rI   r   r   r   r   r   )re   rf   rg   rh   r=   rb   rk   rl   rm   s   @r0   r   r      s    0 r2   r   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )PixtralRMSNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z-
PixtralRMSNorm is equivalent to T5LayerNorm
N)r<   r=   r   	Parameterr   onesweightvariance_epsilon)rH   r   epsrP   s      r0   r=   PixtralRMSNorm.__init__  s/     	ll5::k#:; #r2   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   r   T)keepdim)	rW   r^   r   r   powmeanrsqrtr   r   )rH   r   input_dtypevariances       r0   rb   PixtralRMSNorm.forward	  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r2   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   r   r   rH   s    r0   
extra_reprPixtralRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr2   )r   r   )gư>)	re   rf   rg   rh   r=   rb   r   rk   rl   rm   s   @r0   r   r      s    $;J Jr2   r   c                      ^  \ rS rSrU 4S jr  SS\R                  S\R                  S\\\R                  \R                  4      S\\	   S\
\   S\\R                     4S	 jjrS
rU =r$ )PixtralAttentionLayeri  c                    > [         TU ]  5         [        UR                  SS9U l        [        U5      U l        [        U5      U l        [        UR                  SS9U l	        g )Nh㈵>r   )
r<   r=   r   r   attention_normr   feed_forwardr   	attentionffn_normr   s     r0   r=   PixtralAttentionLayer.__init__  sP    ,V-?-?TJ&v.)&1&v'9'9tDr2   r   r~   r   r   r   r   c                     UnU R                  U5      nU R                  " SUUUUS.UD6u  pXa-   nUnU R                  U5      nU R                  U5      nXa-   nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
    attention_mask (`torch.FloatTensor`):
        Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r~   r   r    )r   r   r   r   )	rH   r   r~   r   r   r   residualr   outputss	            r0   rb   PixtralAttentionLayer.forward  s    $ !++M:&*nn '
') 3/	'

 '
# !0 m4))-8 0 "&Gr2   )r   r   r   r   )NN)re   rf   rg   rh   r=   r   r   r   r   r   r   r
   FloatTensorrb   rk   rl   rm   s   @r0   r   r     s    E LP,0'||' ' &eELL%,,,F&GH	'
 $D>' -.' 
u  	!' 'r2   r   c                      ^  \ rS rSrU 4S jr     SS\\R                     S\\\R                  \R                  4      S\\	   S\\	   S\\	   S\
\   S	\\\4   4S
 jjrSrU =r$ )PixtralTransformeriF  c                   > [         TU ]  5         Xl        [        R                  R                  5       U l        [        UR                  5       H'  nU R                  R                  [        U5      5        M)     SU l        g )NF)r<   r=   rI   r   r   
ModuleListlayersrangenum_hidden_layersr$   r   gradient_checkpointing)rH   rI   r   rP   s      r0   r=   PixtralTransformer.__init__G  s\    hh))+v//0AKK4V<= 1&+#r2   r~   r   r   output_hidden_statesreturn_dictr   r   c                 >   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSn	Un
U R                   Ht  nU(       a  X4-   nU R
                  (       a1  U R                  (       a   U R                  UR                  U
UUU5      nOU" U
U4UUS.UD6nUS   n
U(       d  Ml  XS   4-   n	Mv     U(       a  X4-   nU(       d  [        S XU	4 5       5      $ [        XU	S9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embeddings which serve as input to the Transformer.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr   )r   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frd   r   ).0vs     r0   	<genexpr>-PixtralTransformer.forward.<locals>.<genexpr>  s     e$Sq$Ss   	)last_hidden_stater   
attentions)rI   r   r   use_return_dictr   r   r   _gradient_checkpointing_func__call__r   r   )rH   inputs_embedsr~   r   r   r   r   r   encoder_statesall_attentionsr   encoder_layerlayer_outputss                r0   rb   PixtralTransformer.forwardO  s@   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%![[M#!/2B!B**t}} $ A A!**!"'%! !.!"! )<&7	!
 ! *!,M  !/3C2E!E/ )2  +.>>Ne]N$Seee+Vd
 	
r2   )rI   r   r   )NNNNN)re   rf   rg   rh   r=   r   r   r   r   r   r   r
   r   r   rb   rk   rl   rm   s   @r0   r   r   F  s    , 26KO,0/3&*H
 !.H
 &eELL%,,,F&GH	H

 $D>H
 'tnH
 d^H
 -.H
 
uo%	&H
 H
r2   r   c                   P    \ rS rSr\rSrSrSrSr	Sr
SrSrS/rSr
SrSrSr	S rSrg)	PixtralPreTrainedModeli  modelpixel_valuesTr   c                    U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g g )Nr   )r   stdr8   )rI   initializer_rangerX   r   r   Conv2dr   datanormal_r   zero_r   fill_)rH   rz   r  s      r0   _init_weights$PixtralPreTrainedModel._init_weights  s    kk++fryy"))455MM&&CS&9{{&  &&( '//MM$$S) 0r2   r   N)re   rf   rg   rh   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_2_supports_sdpa_supports_flex_attn_no_split_modulesr  rk   r   r2   r0   r  r    sT    &L$O&*#"&!N01!N"&*r2   r  c                    UR                   nUR                  nUR                  S   n[        R                  " U5      R
                  n[        R                  " XD4XRUS9n[        R                  " U 5      R                  S5      n[        R                  " S/U S S -   5      R                  S5      n[        X5       H  u  pSXiU
2X24'   M     US S S S 2S S 24   R                  UR                  S   SSS5      nU$ )Nr   )
fill_valuerW   r9   r   r   )rW   r9   r   r   finfominfulltensorcumsumzipexpand)r&   r&  rW   r9   seq_lend_mincausal_maskblock_end_idxblock_start_idxstartends              r0   generate_block_attention_maskr1    s    LLE]]Fll1oGKK""E**g/EW]^KLL!23::2>MllA3):3B)?#?@GGKO/9
,-#Iuy() : dD!Q./66v||A2rRKr2   c                      ^  \ rS rSrSrU 4S jrS r\\    SS\	R                  S\\	R                     S\\   S\\   S	\\   S
\\   S\\\4   4S jj5       5       rSrU =r$ )PixtralVisionModeli  vision_encoderc                 n  > [         TU ]  U5        Xl        [        R                  " UR
                  UR                  UR                  UR                  SS9U l        UR                  U l        [        UR                  SS9U l
        [        U5      U l        [        U5      U l        U R                  5         g )NF)in_channelsout_channelskernel_sizestrider   r   r   )r<   r=   rI   r   r  num_channelsr   rC   
patch_convr   ln_prer   transformerr4   patch_positional_embedding	post_initr   s     r0   r=   PixtralVisionModel.__init__  s     ))++++))$$
 !++$V%7%7TB-f5*@*H'r2   c                     U R                   $ rd   )r;  r   s    r0   get_input_embeddings'PixtralVisionModel.get_input_embeddings  s    r2   r  image_sizesr   r   r   r   r   c           
         Uc  UR                   u  ppX4/U-  nU R                  U5      n[        X5       VVs/ s H1  u  pUSS US   U R                  -  2S US   U R                  -  24   PM3     nnn[        R
                  " U Vs/ s H  nUR                  S5      R                  PM      snSS9R                  S5      nU R                  U5      n[        XR                  R                  U R                  R                  -  S9nUUS'   U R                  UU5      n[        U Vs/ s H"  nUR                   S   UR                   S   -  PM$     snU5      nU R                  " U4UUUUS	S
.UD6$ s  snnf s  snf s  snf )N.r   r   r   )r'   r`   r   r   T)r~   r   r   r   r   )r   r;  r(  rC   r   r%   flattenTrs   r<  r1   rI   rB   r>  r1  r=  )rH   r  rD  r   r   r   argsr   r   r   r*   r+   patch_embedsembedr   r&   r   r`   r   r~   s                       r0   rb   PixtralVisionModel.forward  s    +7+=+=(J6"?+j8K |4  #<=
= #5$q'T__457U$q'T__:T7UUV= 	 
 yy:K!L:KQ!))A,..:K!LRST^^_`a{{<0 0)?)?4;;CYCY)Y
 ".~"==lLY60AB0A1QWWR[1772;&0ABL
 
) 3!5/
 
 	
+
 "M Cs   8E1%E7')E<)rI   r<  r;  r>  rC   r=  )NNNN)re   rf   rg   rh   r  r=   rB  r   r   r   r   r   r   r   r
   r   r   r   rb   rk   rl   rm   s   @r0   r3  r3    s    ("  /3/3,0&*-
ll-
 ell+-
 'tn	-

 $D>-
 d^-
 -.-
 
uo%	&-
  -
r2   r3  )Nr   )r   )1ri   collections.abcr   typingr   r   r   r   torch.utils.checkpointr   activationsr	   modeling_flash_attention_utilsr
   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_pixtralr   
get_loggerre   r   r1   Moduler4   rq   ry   r   rD   r   r   r   r   r   r   r  r1  r3  __all__r   r2   r0   <module>rZ     su    $ ) )    ! B / 6 F & > > 6 
		H	% 0<RYY 0<h(F %II%<<% 
% <<	%
 U\\*% % %.M)ryy M)b "JRYY J(/BII /dQ
 Q
h *_ * *2  F
/ F
 F
R  !9
:r2   