"""PyTorch Fuyu model."""

from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModel
from ...processing_utils import Unpack
from ...utils import LossKwargs, auto_docstring, can_return_tuple, logging
from .configuration_fuyu import FuyuConfig


logger = logging.get_logger(__name__)


@auto_docstring
class FuyuPreTrainedModel(PreTrainedModel):
    config_class = FuyuConfig
    base_model_prefix = "fuyu"
    supports_gradient_checkpointing = True
    _supports_attention_backend = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _no_split_modules = []
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
@auto_docstring(
    custom_intro="""
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class FuyuModel(FuyuPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModel.from_config(config.text_config)
        # Each flattened image patch is projected linearly into the text model's embedding space.
        self.vision_embed_tokens = nn.Linear(
            config.patch_size * config.patch_size * config.num_channels, config.hidden_size
        )
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def gather_continuous_embeddings(
        self,
        word_embeddings: torch.Tensor,
        continuous_embeddings: List[torch.Tensor],
        image_patch_input_indices: torch.Tensor,
    ) -> torch.Tensor:
        """This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                `[num_image_embeddings, hidden]`, and `num_image_embeddings` needs to match the number of non-negative
                indices in `image_patch_input_indices` for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        """
        if not (word_embeddings.shape[0] == len(continuous_embeddings)):
            raise ValueError(
                f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
            )

        output_embeddings = word_embeddings.clone()
        for batch_idx in range(word_embeddings.shape[0]):
            # Positions with a non-negative index are the ones to overwrite with patch embeddings.
            dst_indices = torch.nonzero(image_patch_input_indices[batch_idx] >= 0, as_tuple=True)[0]
            # Look those positions up to get the indices into continuous_embeddings to use as replacements.
            src_indices = image_patch_input_indices[batch_idx][dst_indices]
            if src_indices.shape[0] > continuous_embeddings[batch_idx].shape[0]:
                raise ValueError(
                    f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
                    f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
                )
            output_embeddings[batch_idx, dst_indices] = continuous_embeddings[batch_idx][src_indices].to(
                output_embeddings.device
            )
        return output_embeddings
    def get_image_features(self, pixel_values: torch.FloatTensor):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        """
        patch_embeddings = [
            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)).squeeze(0)
            for patch in pixel_values
        ]
        return patch_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
            # Image patches are merged into the text embeddings only on the first (uncached) pass.
            if image_patches is not None and past_key_values is None:
                patch_embeddings = self.get_image_features(image_patches)
                inputs_embeds = self.gather_continuous_embeddings(
                    word_embeddings=inputs_embeds,
                    continuous_embeddings=patch_embeddings,
                    image_patch_input_indices=image_patches_indices,
                )

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=return_dict,
            **kwargs,
        )
        return outputs
@auto_docstring(
    custom_intro="""
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    """
)
class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_embed_tokens": "model.vision_embed_tokens",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.model = FuyuModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ...,
            config.text_config.vocab_size]`.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=True,
        )

        hidden_states = outputs[0]
        # Only project the requested trailing positions (int) or explicit indices (tensor)
        # through the LM head, avoiding the full (batch, seq_len, vocab) logits tensor.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        image_patches=None,
        image_patches_indices=None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            **kwargs,
        )

        if past_key_values is not None:
            # Image patches are merged into the embeddings on the first forward pass only;
            # cached decoding steps must not receive the image inputs again.
            model_inputs["image_patches_indices"] = None
            model_inputs["image_patches"] = None

        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


__all__ = ["FuyuForCausalLM", "FuyuPreTrainedModel", "FuyuModel"]