o
    Zh_r                     @   sx  d Z ddlmZ ddlmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddl m!Z! ddl"m#Z# e$e%Z&eG dd deZ'eG dd deZ(G dd de
j)Z*eG dd deZ+eddG dd de+Z,G dd deeZ-ed dG d!d" d"e+eZ.g d#Z/dS )$zPyTorch PaliGemmamodel.    )	dataclass)ListOptionalTupleUnionN)nn   )CacheHybridCacheStaticCache)GenerationMixin)FlashAttentionKwargs)BaseModelOutputWithPast)PreTrainedModel)Unpack)
LossKwargsModelOutputauto_docstringcan_return_tupleis_torchdynamo_compilinglogging   )	AutoModel   )PaliGemmaConfigc                   @   s$   e Zd ZU dZdZeej ed< dS )PaligemmaModelOutputWithPasta  
    Base class for Paligemma outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__ r$   r$   _/var/www/auris/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   &   s   
 r   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeeej ef  ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	PaliGemmaCausalLMOutputWithPasta  
    Base class for PaliGemma causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r    r'   r   r!   r"   r#   r(   r)   r   r   r	   r*   r   r+   r   r$   r$   r$   r%   r&   G   s   
 r&   c                       s*   e Zd Zdef fddZdd Z  ZS )PaliGemmaMultiModalProjectorconfigc                    s(   t    tj|jj|jjdd| _d S )NTbias)super__init__r   Linearvision_confighidden_sizeZprojection_dimlinearselfr-   	__class__r$   r%   r1   p   s   
z%PaliGemmaMultiModalProjector.__init__c                 C   s   |  |}|S N)r5   )r7   image_featuresr*   r$   r$   r%   forwardt   s   
z$PaliGemmaMultiModalProjector.forward)r   r   r   r   r1   r<   __classcell__r$   r$   r8   r%   r,   o   s    r,   c                   @   sB   e Zd ZeZdZdZdgZdZdZ	dZ
dZdZdZdZdd ZdS )PaliGemmaPreTrainedModel Tr,   r)   c                 C   sV   t | jd| j j}t|tjr'|jjj	d|d |j
d ur)|j
j  d S d S d S )Ninitializer_range        )meanstd)getattrr-   Zget_text_configr@   
isinstancer   r2   weightdataZnormal_r/   Zzero_)r7   modulerC   r$   r$   r%   _init_weights   s   
z&PaliGemmaPreTrainedModel._init_weightsN)r   r   r   r   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_cache_classZ_supports_quantized_cacheZ_supports_static_cacheZ_supports_flash_attn_2Z_supports_sdpaZ_supports_attention_backendrI   r$   r$   r$   r%   r>   z   s    r>   z{
    The Base Paligemma model which consists of a vision backbone and a language model withou language modeling head.,
    )Zcustom_introc                #       s(  e Zd ZddiZdef fddZdd Zdd	 Z	
	
	
	
	
d!dee	 fddZ
dejfddZee	
	
	
	
	
	
	
	
	
	
	
	
	
d"dejdejdeej deej deeeej ef  deej deej deej deej dee	 dee	 dee	 dee	 dee deeef fdd Z  ZS )#PaliGemmaModelzlanguage_model.modellanguage_modelr-   c                    sj   t  | tj|jd| _t|| _|jj	| _	tj|jd}|| _
| jjd ur,| jjnd| _|   d S )N)r-   )r0   r1   r   Zfrom_configr3   vision_towerr,   multi_modal_projectortext_config
vocab_sizerK   r-   Zpad_token_id	post_init)r7   r-   rK   r8   r$   r%   r1      s   

zPaliGemmaModel.__init__c                 C   
   | j  S r:   )rK   get_input_embeddingsr7   r$   r$   r%   rS         
z#PaliGemmaModel.get_input_embeddingsc                 C      | j | d S r:   )rK   set_input_embeddingsr7   valuer$   r$   r%   rW         z#PaliGemmaModel.set_input_embeddingsNis_trainingc                 C   s  | j jjdkr|d urd|v r|S d S |d ur|n| j}t|t}t| jj	}|d u r.|}|j
d d \}	}
|r>| }nt|trH| }nt|tjrS|j
d n|d |
 d }|d urg| dkrg|S tj|
|f|| j|jd}|
dkr|rtj|dd	}n
d|d d d |
f< |tj||jd
|ddk9 }|d d d d d d f |	ddd}|d ur>| }|j
d }|r|d u rtd|d d d d d d d |f |d d d d d d f |jdkd|d d d d d d d |f< |d d d d d d d |f |d d d d d d f |j }|dk}|d d d d d d d |f |||d d d d d d d |f< |S )NZflash_attention_2rA   r   rL   r   r      Z
fill_valuedtypedeviceZdiagonalr_   z/Token type ids must be provided during training)r-   rO   Z_attn_implementationZtrainingrE   r   r!   finfor^   minshapeZget_max_cache_shaper
   Tensordimfullr_   triuarangereshapeexpandclone
ValueErrormasked_fillto)r7   attention_masktoken_type_idsr)   cache_positioninput_tensorr[   Zusing_static_cache	min_dtypeZinputs_lead_dimsequence_lengthtarget_lengthcausal_maskmask_lengthpadding_maskr$   r$   r%   _update_causal_mask   sX   	




 $

 $ @  z"PaliGemmaModel._update_causal_maskpixel_valuesc                 C   s0   |  |}|j}| |}|| jjjd  }|S )a  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        g      ?)rM   last_hidden_staterN   r-   rO   r4   )r7   r{   Zimage_outputsZselected_image_featurer;   r$   r$   r%   get_image_features   s
   


z!PaliGemmaModel.get_image_features	input_idsrp   position_idsr)   rq   rr   inputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictkwargsreturnc                 K   s:  |du |duA rt d|dur|n| jj}|dur|n| jj}|dur&|n| jj}|duo1|	du}|durL| jj| jkrL|| jjk}| }d||< n|}|du rX|  |}|du rt|durd|	 nd}t
j|||jd  |jd}|du r|dd }|dur| |}|du r||  t
j| jjt
j|jdk}n|| jjkd}|||j}t s||  | kr|jddjddd }t d	| d
|jd |jd   d||j|j}|||}| ||||||}| jd|||||
||d|d	|}t|j|j|j|j|dur|dS ddS )i  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   ra   )r^   r_   rL   )rf   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.T)	rp   r   r)   r   r   r   r   r   rr   )r|   r)   r*   r+   r   r$   ) rm   r-   r   r   use_return_dictZimage_token_idrP   rl   rS   Zget_seq_lengthr!   ri   rd   r_   Z	unsqueezer}   ZtensorlongZ	expand_asro   r   Znumelsumr^   Zmasked_scatterrz   rK   r   r|   r)   r*   r+   )r7   r~   r{   rp   r   r)   rq   rr   r   r   r   r   r   r   r   r[   Zspecial_image_maskZllm_input_idsZpast_seen_tokensr;   Zimage_tokens_in_textrw   outputsr$   r$   r%   r<     s   /


zPaliGemmaModel.forward)NNNNN)NNNNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingr   r1   rS   rW   r   boolrz   r!   r"   r}   r   r   
LongTensorre   r   r   r	   r   r   r   r   r<   r=   r$   r$   r8   r%   rJ      sx    
D	

rJ   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r   r   r   r$   r$   r$   r%   r     s    r   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c                %       s  e Zd ZdddddZdgZdef fdd	Zd
d Zdd Zdd Z	dd Z
edd Zedd Zedd Zee														d5dejdejdeej deej deeeej ef  deej d eej d!eej d"eej d#ee d$ee d%ee d&ee d'eeejf d(ee d)eeef f d*d+Z								,		d6 fd-d.	Z e!dejd/ed0ed1ej"d ejd2efd3d4Z#  Z$S )7!PaliGemmaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr-   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFr.   )r0   r1   rJ   modelr   r2   rO   r4   rP   r   rQ   r6   r8   r$   r%   r1     s   
z*PaliGemmaForConditionalGeneration.__init__c                 C   rR   r:   )r   rS   rT   r$   r$   r%   rS     rU   z6PaliGemmaForConditionalGeneration.get_input_embeddingsc                 C   rV   r:   )r   rW   rX   r$   r$   r%   rW     rZ   z6PaliGemmaForConditionalGeneration.set_input_embeddingsc                 C   s   | j S r:   r   rT   r$   r$   r%   get_output_embeddings  s   z7PaliGemmaForConditionalGeneration.get_output_embeddingsc                 C   s
   || _ d S r:   r   )r7   Znew_embeddingsr$   r$   r%   set_output_embeddings  rU   z7PaliGemmaForConditionalGeneration.set_output_embeddingsc                 C      | j jS r:   )r   rK   rT   r$   r$   r%   rK        z0PaliGemmaForConditionalGeneration.language_modelc                 C   r   r:   )r   rM   rT   r$   r$   r%   rM     r   z.PaliGemmaForConditionalGeneration.vision_towerc                 C   r   r:   )r   rN   rT   r$   r$   r%   rN     r   z7PaliGemmaForConditionalGeneration.multi_modal_projectorNr   r~   r{   rp   r   r)   rq   rr   r   r   r   r   r   r   logits_to_keepr   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd||||||||
|	||d|d|}|d }t|trCt| dn|}| |dd|ddf }d}|	durh| j	d||	| j j
jd|}t|||j|j|j|jdS )r   NT)r~   r{   rq   rp   r   r)   r   r   r   r   r   r   rr   r   )r(   r   rP   )r'   r(   r)   r*   r+   r   r$   )r-   r   r   r   r   rE   intslicer   Zloss_functionrO   rP   r&   r)   r*   r+   r   )r7   r~   r{   rp   r   r)   rq   rr   r   r   r   r   r   r   r   r   r   r*   Zslice_indicesr(   r'   r$   r$   r%   r<     sN   /z)PaliGemmaForConditionalGeneration.forwardTc                    s   t  j|f||||||	|
|d|}|dd ur"|d  d7  < |d dkr,||d< |d uo3|d u}|d dkrVt|trV|d urE|n|}| j||||||}||d< |S )N)r)   r   rp   r   rr   r   r   rq   r   r   r   r{   rp   )r0   prepare_inputs_for_generationgetrE   r
   r   rz   )r7   r~   r)   r   rr   r   r{   rp   rq   r   r   r   r   Zmodel_inputsr[   rs   rw   r8   r$   r%   r   
  s4   
z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generationru   rv   r^   
batch_sizec                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr\   r]   r   r`   ra   rL   r   )rf   r!   rb   rc   rg   r_   rh   ri   rj   rk   rl   rd   ro   rn   )rp   ru   rv   r^   rr   r   r   rw   rt   rx   ry   r$   r$   r%   5_prepare_4d_causal_attention_mask_with_cache_position8  s,    $
6  zWPaliGemmaForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNNNNr   )
NNNNNNNTNN)%r   r   r   r   Z_tied_weights_keysr   r1   rS   rW   r   r   propertyrK   rM   rN   r   r   r!   r   r"   r   re   r   r   r	   r   r   r   r   r   r&   r<   r   staticmethodr^   r   r=   r$   r$   r8   r%   r     s    


	

[.r   )r   r>   rJ   )0r    dataclassesr   typingr   r   r   r   r!   Ztorch.utils.checkpointr   Zcache_utilsr	   r
   r   Z
generationr   Zmodeling_flash_attention_utilsr   Zmodeling_outputsr   Zmodeling_utilsr   Zprocessing_utilsr   utilsr   r   r   r   r   r   autor   Zconfiguration_paligemmar   Z
get_loggerr   loggerr   r&   Moduler,   r>   rJ   r   r   __all__r$   r$   r$   r%   <module>   sF    
 ' i j