o
    ZhO3                     @   s   d Z ddlmZ ddlmZmZmZmZ ddlm	Z	 ddl
mZmZmZ ddlmZmZmZmZmZ ddlmZmZ dd	lmZ erHdd
lmZ eeZdefddZdd ZG dd deddZ G dd deddZ!G dd deZ"dgZ#dS )z
Processor class for IDEFICS2.
    )
accumulate)TYPE_CHECKINGListOptionalUnion   )BatchFeature)
ImageInputis_valid_image
load_image)ImagesKwargsProcessingKwargsProcessorMixinUnpack!_validate_images_text_input_order)
AddedToken	TextInput)logging)PreTokenizedInputreturnc                 C   s   t | to	| dS )Nhttp)
isinstancestr
startswith)val r   _/var/www/auris/lib/python3.10/site-packages/transformers/models/idefics2/processing_idefics2.pyis_url*   s   r   c                 C   s   t | pt| S N)r   r
   )elemr   r   r   is_image_or_image_url.   s   r    c                   @   s   e Zd ZU ee ed< dS )Idefics2ImagesKwargsimage_seq_lenN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r!   2   s   
 r!   F)totalc                   @   s(   e Zd ZU eed< ddddi dZdS )Idefics2ProcessorKwargsimages_kwargsTF)add_special_tokenspaddingZis_split_into_words)text_kwargsr*   N)r#   r$   r%   r!   r'   	_defaultsr   r   r   r   r)   6   s   
 
r)   c                
       s   e Zd ZdZddgZddgZdZdZ	ddede	e
 f fd
dZdd Z				ddeeee eee  f deedee ed f dee defddZdd Zdd Zedd Z  ZS )Idefics2Processora  
    Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor.

    [`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`Idefics2ImageProcessor`):
            An instance of [`Idefics2ImageProcessor`]. The image processor is a required input.
        tokenizer (`PreTrainedTokenizerBase`, *optional*):
            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
        image_seq_len (`int`, *optional*, defaults to 64):
            The length of the image sequence i.e. the number of <image> tokens per image in the input.
            This parameter is used to build the string from the input prompt and image tokens and should match the
            config.perceiver_config.resampler_n_latents value for the model used.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    image_processor	tokenizerr"   chat_templateZIdefics2ImageProcessorZAutoTokenizerN@   c                    s   |d u rt d|d u rt dt|ds<tddddj| _tddddj| _d	| j| jgi}|| || j| _n|j	| _|j| _|j| _td
ddd| _
|d	| j
gi || _t j|||d d S )Nz)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.image_tokenz<fake_token_around_image>FT)
normalizedZspecialz<image>Zadditional_special_tokensz<end_of_utterance>)r2   )
ValueErrorhasattrr   contentfake_image_tokenr4   r+   Zconvert_tokens_to_idsZimage_token_idZimage_boundary_tokenZend_of_utterance_tokenr"   super__init__)selfr0   r1   r"   r2   kwargsZtokens_to_add	__class__r   r   r;   \   s"   

zIdefics2Processor.__init__c                 C   sT   g }|D ]#}g }|D ]}t |r|| q
t|r!|t| q
|| q|S r   )r
   appendr   r   )r<   ZpromptsZprompt_imagespromptimagesr   r   r   r   _extract_images_from_promptsu   s   z.Idefics2Processor._extract_images_from_promptsrB   textr   r=   r   c              
      s  |du rdu rt dt|\}| jtfd| jji|}|d dd}|dur.|n| j}|d dd}g }	i }
|durt|t	rJ|g}nt|t
sZt|d t	sZt d	| j}| j}| ||  | }| jjrv|d
 }|d
9 }g }|D ] }|	|| |||}|| | | }|| qz| j|fi |d }| j||dgd |
| duratrggnltt
tfrtd r|durt|	tkrt d| dt|	 d| dt d	dgt
t|	   fddtt|	D n#gntt
tfs.td t
tfs.td d s.t ddd D }|durJ||	ksJt d|	 d| ddd D | jfi |d }|
| t|
|dS )a
  
        Processes the input prompts and returns a BatchEncoding.

        Example:

        ```python
        >>> import requests
        >>> from transformers import Idefics2Processor
        >>> from transformers.image_utils import load_image

        >>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2)
        >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example

        >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
        >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"

        >>> image1, image2 = load_image(url1), load_image(url2)
        >>> images = [[image1], [image2]]

        >>> text = [
        ...     "<image>In this image, we see",
        ...     "bla bla bla<image>",
        ... ]
        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
        >>> input_ids = outputs.input_ids
        >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
        >>> print(input_tokens)
        ['<s><fake_token_around_image><image><image><fake_token_around_image> In this image, we see', '<s> bla bla bla<fake_token_around_image><image><image><fake_token_around_image>']
        ```

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
            text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).

                Wherever an image token, `<image>` is encountered it is expanded to
                `<fake_token_around_image>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
            return_tensors (`Union[str, TensorType]`, *optional*):
                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
                information.

        Nz+You must provide either `text` or `images`.Ztokenizer_init_kwargsr*   r"   r-   return_tensorsr   zAInvalid input text. Please provide a string, or a list of strings   image)Z
modalitieszThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.c                    s$   g | ]} |  |d    qS )   r   ).0iZcumsum_images_in_textrB   r   r   
<listcomp>   s    z.Idefics2Processor.__call__.<locals>.<listcomp>zdInvalid input images. Please provide a single image or a list of images or a list of list of images.c                 S      g | ]}t |qS r   )lenrJ   sampler   r   r   rM          z!The number of images in the text z and images  z should be the same.c                 S   s   g | ]	}d d |D qS )c                 S   rN   r   )r   )rJ   Zimr   r   r   rM     rR   z9Idefics2Processor.__call__.<locals>.<listcomp>.<listcomp>r   rP   r   r   r   rM     s    )Ztensor_type)r6   r   Z_merge_kwargsr)   r1   Zinit_kwargspopr"   r   r   listr9   r4   r0   Zdo_image_splittingr@   countreplaceZ_check_special_mm_tokensupdater    tuplesumrO   r   ranger   )r<   rB   rD   ZaudioZvideosr=   Zoutput_kwargsr"   rE   Zn_images_in_textZinputsr9   r4   Z	image_strZprompt_stringsrQ   Ztext_inputsZn_images_in_imagesZimage_inputsr   rL   r   __call__   s   6






zIdefics2Processor.__call__c                 O      | j j|i |S )z
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r1   batch_decoder<   argsr=   r   r   r   r]        zIdefics2Processor.batch_decodec                 O   r\   )z
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r1   decoder^   r   r   r   ra     r`   zIdefics2Processor.decodec                 C   s"   | j j}| jj}tt|| S r   )r1   model_input_namesr0   rT   dictfromkeys)r<   Ztokenizer_input_namesZimage_processor_input_namesr   r   r   rb     s   z#Idefics2Processor.model_input_names)Nr3   N)NNNN)r#   r$   r%   __doc__
attributesZvalid_kwargsZimage_processor_classZtokenizer_classr&   r   r   r;   rC   r   r	   r   r   r   r)   r   r[   r]   ra   propertyrb   __classcell__r   r   r>   r   r/   C   s>    
 r/   N)$re   	itertoolsr   typingr   r   r   r   Zfeature_extraction_utilsr   Zimage_utilsr	   r
   r   Zprocessing_utilsr   r   r   r   r   Ztokenization_utils_baser   r   utilsr   r   Z
get_loggerr#   loggerboolr   r    r!   r)   r/   __all__r   r   r   r   <module>   s$   
 
`