"""PyTorch Pixtral model."""

from collections.abc import Callable
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput
from ...modeling_rope_utils import dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_pixtral import PixtralVisionConfig


logger = logging.get_logger(__name__)


def position_ids_in_meshgrid(patch_embeds_list, max_width):
    positions = []
    for patch in patch_embeds_list:
        height, width = patch.shape[-2:]
        mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
        h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1)
        # the position id of patch (row, col) is row * max_width + col
        ids = h_grid * max_width + v_grid
        positions.append(ids[:, 0])
    return torch.cat(positions)


class PixtralRotaryEmbedding(nn.Module):
    """
    The key idea of the Pixtral embedding is simply that there is a frequency for each pixel position.
    If you have height x width pixels (or embedding pixels), then the frequency used for RoPE
    is given by indexing the precomputed frequency table at that height and width.

    The output is of dimension (batch, height * width, dim), with dim the embed dim.

    This simply means that for each image hidden state, you are going to add
    a corresponding positional embedding, based on its index in the grid.
    """

    def __init__(self, config, device=None):
        super().__init__()
        self.rope_type = "default"
        self.dim = config.head_dim
        self.base = config.rope_theta
        max_patches_per_side = config.image_size // config.patch_size
        freqs = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))

        h = torch.arange(max_patches_per_side, device=freqs.device)
        w = torch.arange(max_patches_per_side, device=freqs.device)

        # even-indexed frequencies encode the row (height), odd-indexed ones the column (width)
        freqs_h = torch.outer(h, freqs[::2]).float()
        freqs_w = torch.outer(w, freqs[1::2]).float()
        inv_freq = torch.cat(
            [
                freqs_h[:, None, :].repeat(1, max_patches_per_side, 1),
                freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1),
            ],
            dim=-1,
        ).reshape(-1, self.dim // 2)  # indexed by the flattened (row * max_patches_per_side + col) position

        self.register_buffer("inv_freq", torch.cat((inv_freq, inv_freq), dim=-1), persistent=False)

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        freqs = self.inv_freq[position_ids]
        # compute cos/sin with autocast disabled so low-precision dtypes do not lose accuracy
        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            emb = freqs
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
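
# Illustrative example (not part of the original module): for a 2 x 3 grid of patches and a
# model whose image_size // patch_size == 4, `position_ids_in_meshgrid` yields
# `row * max_width + col` for every patch:
#
#     >>> ids = position_ids_in_meshgrid([torch.zeros(1, 64, 2, 3)], max_width=4)
#     >>> ids.tolist()
#     [0, 1, 2, 4, 5, 6]
#
# `PixtralRotaryEmbedding.forward` then gathers `inv_freq[position_ids]`, so every patch picks up
# the (height, width) frequency pair of its own grid cell before the cos/sin tables are built.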


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class PixtralAttention(nn.Module):
    """
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads

        self.scaling = self.head_dim**-0.5
        self.is_causal = False
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, patches, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=0)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        # with packed sequences, flash attention relies on position_ids; move them to the right device
        if self.config._attn_implementation == "flash_attention_2":
            kwargs["position_ids"] = kwargs["position_ids"].to(hidden_states.device, non_blocking=True)

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class PixtralMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj
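
# Illustrative note (not part of the original module): PixtralMLP is a gated feed-forward block.
# For an input `x` of shape (batch, seq_len, hidden_size) it computes
#
#     down_proj(act_fn(gate_proj(x)) * up_proj(x))
#
# where gate_proj/up_proj map hidden_size -> intermediate_size and down_proj maps back, so the
# output shape matches the input shape.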


class PixtralRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        PixtralRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class PixtralAttentionLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)
        self.feed_forward = PixtralMLP(config)
        self.attention = PixtralAttention(config)
        self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.attention_norm(hidden_states)
        hidden_states, attn_weights = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.ffn_norm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)
        return outputs


class PixtralTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layers = torch.nn.ModuleList()
        for _ in range(config.num_hidden_layers):
            self.layers.append(PixtralAttentionLayer(config))
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutput]:
        """
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_embeddings,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    position_embeddings=position_embeddings,
                    output_attentions=output_attentions,
                    **kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


@auto_docstring
class PixtralPreTrainedModel(PreTrainedModel):
    config_class = PixtralVisionConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PixtralAttentionLayer"]
    _supports_attention_backend = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, PixtralRMSNorm):
            module.weight.data.fill_(1.0)


def generate_block_attention_mask(patch_embeds_list, tensor):
    dtype = tensor.dtype
    device = tensor.device
    seq_len = tensor.shape[1]
    d_min = torch.finfo(dtype).min
    causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device)

    # each image may only attend to its own block of patches
    block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1)
    block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1)
    for start, end in zip(block_start_idx, block_end_idx):
        causal_mask[start:end, start:end] = 0

    causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1)
    return causal_mask


@auto_docstring
class PixtralVisionModel(PixtralPreTrainedModel):
    base_model_prefix = "vision_encoder"

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.patch_conv = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=config.hidden_size,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )
        self.patch_size = config.patch_size
        self.ln_pre = PixtralRMSNorm(config.hidden_size, eps=1e-5)
        self.transformer = PixtralTransformer(config)
        self.patch_positional_embedding = PixtralRotaryEmbedding(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.patch_conv

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        image_sizes: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        *args,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutput]:
        if image_sizes is None:
            batch_size, _, height, width = pixel_values.shape
            image_sizes = [(height, width)] * batch_size

        # pass images through the initial convolution independently
        patch_embeds = self.patch_conv(pixel_values)
        patch_embeds_list = [
            embed[..., : size[0] // self.patch_size, : size[1] // self.patch_size]
            for embed, size in zip(patch_embeds, image_sizes)
        ]

        # flatten all images into a single packed sequence of patch embeddings
        patch_embeds = torch.cat([p.flatten(1).T for p in patch_embeds_list], dim=0).unsqueeze(0)
        patch_embeds = self.ln_pre(patch_embeds)

        # positional embeddings indexed on the (row, col) grid of every image
        position_ids = position_ids_in_meshgrid(
            patch_embeds_list, max_width=self.config.image_size // self.config.patch_size
        )
        kwargs["position_ids"] = position_ids
        position_embeddings = self.patch_positional_embedding(patch_embeds, position_ids)

        # block-diagonal mask so patches only attend within their own image
        attention_mask = generate_block_attention_mask(
            [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds
        )
        return self.transformer(
            patch_embeds,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True,
            **kwargs,
        )


__all__ = ["PixtralVisionModel", "PixtralPreTrainedModel"]
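

# Usage sketch (illustrative, not part of the original module; the image size and config values
# below are assumptions — a real checkpoint and `PixtralImageProcessor` would normally supply the
# config, `pixel_values`, and `image_sizes`):
#
#     >>> config = PixtralVisionConfig()                                  # default hyper-parameters
#     >>> model = PixtralVisionModel(config).eval()
#     >>> pixel_values = torch.randn(1, config.num_channels, 224, 336)    # hypothetical image batch
#     >>> with torch.no_grad():
#     ...     outputs = model(pixel_values, image_sizes=[(224, 336)])
#     >>> outputs.last_hidden_state.shape                                 # (1, num_patches, hidden_size)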