o
    Zh<]                     @   s0  d Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZ ddlmZmZ dd	lmZmZ dd
lmZ e rOddlZe rVddlZdZG dd deddZ G dd deddZ!G dd deddZ"d"ddZ#dd Z$dd Z%dd Z&dd Z'G d d! d!eZ(d!gZ)dS )#z
Processor class for IDEFICS.
    )CallableDictListOptionalUnion)urlparse   )BatchFeature)
ImageInput)ImagesKwargsProcessingKwargsProcessorMixin
TextKwargsUnpack!_validate_images_text_input_order)PreTokenizedInput	TextInput)is_tf_availableis_torch_available)deprecate_kwargN<image>c                   @   s^   e Zd ZU ee ed< eeeef  ed< ee	e
ee
 f  ed< ee	e
ee
 f  ed< dS )IdeficsImagesKwargsZ	transform
image_sizeZ
image_meanZ	image_stdN)__name__
__module____qualname__r   r   __annotations__r   strintr   floatr    r    r    ]/var/www/auris/lib/python3.10/site-packages/transformers/models/idefics/processing_idefics.pyr   .   s
   
 r   F)totalc                   @   s&   e Zd ZU ee ed< ee ed< dS )IdeficsTextKwargsadd_eos_tokenadd_end_of_utterance_tokenN)r   r   r   r   boolr   r    r    r    r!   r#   5   s   
 r#   c                   @   s6   e Zd ZU eed< eed< ddddi ddidZd	S )
IdeficsProcessorKwargstext_kwargsimages_kwargsFlongest)Zadd_special_tokenspaddingr$   return_tensorspt)r(   r)   Zcommon_kwargsN)r   r   r   r#   r   r   	_defaultsr    r    r    r!   r'   :   s   
 
r'   c                 C   s   |dkr|dkrd| | |k< n|dkrt | |kd| } |dkr;| dk}d| |< tjjj| |d}d||d d f< |S |dkrct | d}t |d| } t j| |d}t |d}t |t ||}|S )Nr/   r-   tfr   num_classes)depth)	r0   wheretorchnnZ
functionalZone_hotequalZexpand_dimsZ
zeros_like)Zincremental_maskr,   r2   Z	negativesZ	attn_maskZnegatives_expandedr    r    r!   $incremental_to_binary_attention_maskI   s$   	r8   c                 C   s(   |dkr	t | |S |dkrt| |S d S )Nr-   r0   ),image_attention_mask_for_packed_input_ids_pt,image_attention_mask_for_packed_input_ids_tf)	input_ids	tokenizerr,   r    r    r!   )image_attention_mask_for_packed_input_idsc   s
   

r=   c                 C   sv  t j| dd}t j| dd}|t}|j}t| dD ]6}d}d}t| | D ])\}	}
|
|kr>|d7 }||| |	< d}n||| |	< |rLd|| |	< |
|krRd}q)qt| dD ][}d}d}t| | dd ddD ]-}	| | |	 }
|
|kr|d7 }||| |	< d}n||| |	< |
|krd}|rd|| |	< qn|| dk}|| |  |8  < || |  d9  < q[||fS )Nr/   )Z
fill_valuer   F   T)r5   Z	full_likeconvert_tokens_to_idsIMAGE_TOKENeos_token_idrangesize	enumerate)r;   r<   image_attention_masknext_image_attention_maskimage_token_ideod_token_id	batch_idxcountseen_eodidxtoken_idZnon_negative_indicesr    r    r!   r9   j   sL   
r9   c                 C   s.  | t}|j}t| d }tt| d}tt| d}t|D ]m}d}d}	t| d }
t|
d ddD ]W}| ||f  }||krc|d7 }||gg}|g}t|||}t|||}n||kr||	s|d}	d}||gg}|g}t|||}|	r||kr||gg}dg}t|||}q:q%||fS )Nr   r/   Fr>   T)	r?   r@   rA   r0   shapefillrB   numpytensor_scatter_nd_update)r;   r<   rG   rH   Z
batch_sizerE   rF   rI   rJ   rK   Z
seq_lengthrL   rM   indicesupdatesr    r    r!   r:      s<   



r:   c                 C   s$   d| v rdS t | }t|j|jgS )zChecks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
    invalidated the url F)r   allschemenetloc)stringresultr    r    r!   is_url   s   rZ   c                       s   e Zd ZdZddgZddgZdZdZd fd
d	Ze	ddddd				dde
eee eee eee  f de
eeee ee eee  eee  f dee defddZdd Zdd Zedd Z  ZS )IdeficsProcessorah  
    Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.

    [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`IdeficsImageProcessor`):
            An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
        tokenizer (`LlamaTokenizerFast`):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        image_size (`int`, *optional*, defaults to 224):
            Image size (assuming a square image)
        add_end_of_utterance_token (`str`, *optional*):
            The string representation of token representing end of utterance
    image_processorr<   r   r%   ZIdeficsImageProcessorZLlamaTokenizerFastN   c                    s   |d u rt d|d u rt dt || | j| _t|dr#|jn|t| _| jj	| jj
| jj
f| _d| jjdg v rDd| _d S d| _d S )Nz)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.image_token<end_of_utterance>Zadditional_special_tokensTF)
ValueErrorsuper__init__r\   Zcurrent_processorhasattrrG   r?   r@   Zimage_num_channelsr   default_image_dimsr<   Zspecial_tokens_mapget1tokenizer_was_trained_with_end_of_utterance_token)selfr\   r<   r   r%   kwargs	__class__r    r!   rb      s&   zIdeficsProcessor.__init__promptsz5.0.0textT)Zold_nameversionnew_nameZraise_if_both_namesimagesrh   returnc           .         sF  |du r|du rt dt||\}}|du r|}nM|durgt|ttfs(|g}t|tr0|g}t|ttfrCt|t|krCt dtdd |D sPt dt|d ttfr`dd	 |D }tt||}| j	t
fd
| jji|}|d dd}|d dd}	|	du r| j}	tdd |D s|g}d dd}
 fdd}g }g }|D ]|}| jj }g }d}d}t|D ]L\}}|dkr|sdnd}t|tr|d}t|r| j|}|||7 }|| d}q|	r|r||
7 }||7 }d}q|||7 }|| d}q|r|| jj7 }| j|fi |d }|| || q|d dd}| j|fi |d }|d }|d }tdd |D }td|}tdd |D dk}g }g }g }t|||D ]\}} }!|}"|"| j}#t|#|}$|!d|$ }%t|%dkr|dkrtj|g|% dd R  }&|%|&d|%d< nY|dkrt !|%dd }'t j"|g|'gdd }(t j|(|%j#d!}&t !|%d })t $t %|)d"}*|%}+t &|&|*|+}&n|dkrtj|g| j'R  }&n|dkrt |g| j'R }&||& |dkr|t(|" |t(|  qg|dkr.|t j)|"t j*d! ||  qg|dkrEt+|}t+|}t+|}n|dkrYt +|}t +|}t +|}|rmt,|| j|\},}-t-|,||d#},n,|dkrtj|j!d |j!d dtj.d!},n|dkrt j|j!d |j!d dft j.d!},t/||||,d$d%S )&a  This method takes batched or non-batched prompts made of text and images and converts them into prompts that
        the model was trained on and prepares the image pixel values for the model to process.

        Args:
            images (`Union[ImageInput, List[ImageInput], str, List[str], List[List[str]]]`):
                either a single image or a batched list of images - can be passed in when text contains only text prompts,
                in order to use the image-text-to-text behavior.
            text (`Union[List[TextInput], [List[List[TextInput]]]]`):
                either a single prompt or a batched list of prompts - see the detailed description immediately after
                the end of the arguments doc section.
            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
                The type of tensors to return. Can be one of:
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.

        Returns:
            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
            directly passed to `model.generate`

        Detailed explanation:

        Each entry in `text` is either a text to be passed as is or an image that will be processed.

        An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.

        When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
        entry into the prompt.

        Example:

        ```python
        checkpoint = "HuggingFaceM4/idefics-9b"
        processor = AutoProcessor.from_pretrained(checkpoint)
        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
        img = processor.image_processor.fetch_images([url])[0]

        prompts = [
            "User:",
            img,
            "Describe this image.
Assistant: An image of two kittens in grass.
",
            "User:",
            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
            "Describe this image.
Assistant:",
        ]

        inputs = processor(text=prompts, return_tensors="pt")
        generated_ids = model.generate(**inputs, max_length=100)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```

        In this example the `prompts` will be converted into:

        ```
        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant: An image of two kittens in grass.
        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant:'
        ```

        and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
        `pixel_values` dict entry of the return value.

        This example also exemplifies that images can be passed as objects or as text urls. It can be seen that the
        first image is passed as object and the second one as a url.

        To do training do:

        ```python
        image_transform = transforms.Compose(
            [
                transforms.RandomResizedCrop(
                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
                ),
                transforms.ToTensor(),
                transforms.Normalize(mean=self.image_mean, std=self.image_std),
            ]
        )
        inputs = processor(text=prompts, transform=image_transform, return_tensors="pt")
        ```

        In order to help debug prompt generation enable `debug=True` which will show you what's happening.

        Nz9You need to specify either `text` or `images` and `text`.a  When providing both images and text arguments, the number of text prompts should be the same as the number of images.If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...].c                 s   s    | ]}t |tV  qd S N)
isinstancer   .0ir    r    r!   	<genexpr>l  s    z,IdeficsProcessor.__call__.<locals>.<genexpr>zQWhen using the image-text-to-text behavior, the prompts should only contain text.r   c                 S   s   g | ]}|gqS r    r    rs   r    r    r!   
<listcomp>p  s    z-IdeficsProcessor.__call__.<locals>.<listcomp>Ztokenizer_init_kwargsr(   r$   Fr%   c                 s   s    | ]
}t |ttfV  qd S rq   )rr   listtuplers   r    r    r!   rv     s    z<fake_token_around_image>r   r_   c                    s   | r  S     S rq   r    )last_was_imageZ
fake_tokenr^   r    r!   image_tokens  s   z/IdeficsProcessor.__call__.<locals>.image_tokensTrT   r)   r,   r-   r;   attention_maskc                 s       | ]}t |V  qd S rq   lenrt   xr    r    r!   rv         r>   c                 s   r~   rq   r   r   r    r    r!   rv     r   r0   )Zaxis)dtype)r/   r>   r1   )r;   r}   Zpixel_valuesrE   )data)0r`   r   rr   rx   ry   r   r   rU   zipZ_merge_kwargsr'   r<   Zinit_kwargspoprf   anyZ	bos_tokenrD   striprZ   r\   Zfetch_imagesappendZ	eos_tokenmaxsumrJ   rG   minr5   ZzerosrC   r0   rN   concatr   ZreshaperB   rQ   rd   ZtensorZconvert_to_tensorZint32stackr=   r8   r&   r	   ).rg   ro   rl   ZaudioZvideosrh   rk   Zoutput_kwargsr$   r%   Zend_of_utterance_tokenr|   Zall_promptsZ
all_imagessample	full_textZimage_objectsrz   Zlast_was_textru   itemimager,   text_encodingZ	all_textsZall_attention_masksZmax_num_imagesZat_least_one_imageZoutput_input_idsZoutput_imagesZoutput_attention_masksZtext_singler}   Zextracted_imagesZpadded_input_idsZimage_countZlocal_max_num_imagesZcurrent_imagesZpadded_image_tensorZimage_shapeZpadded_shapeZ
num_imagesrR   rS   rE   _r    r{   r!   __call__   s  b

























zIdeficsProcessor.__call__c                 O      | j j|i |S )z
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r<   batch_decoderg   argsrh   r    r    r!   r        zIdeficsProcessor.batch_decodec                 O   r   )z
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r<   decoder   r    r    r!   r     r   zIdeficsProcessor.decodec                 C   s"   | j j}| jj}tt|| S rq   )r<   model_input_namesr\   rx   dictfromkeys)rg   Ztokenizer_input_namesZimage_processor_input_namesr    r    r!   r     s   z"IdeficsProcessor.model_input_names)Nr]   N)NNNN)r   r   r   __doc__
attributesZvalid_kwargsZimage_processor_classZtokenizer_classrb   r   r   r
   r   r   r   r   r   r'   r	   r   r   r   propertyr   __classcell__r    r    ri   r!   r[      sF     

  r[   )r/   )*r   typingr   r   r   r   r   urllib.parser   Zfeature_extraction_utilsr	   Zimage_utilsr
   Zprocessing_utilsr   r   r   r   r   r   Ztokenization_utils_baser   r   utilsr   r   Zutils.deprecationr   r5   Z
tensorflowr0   r@   r   r#   r'   r8   r=   r9   r:   rZ   r[   __all__r    r    r    r!   <module>   s4    
/!	  
c