"""Processor class for Mllama."""

from typing import List, Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput


class MllamaImagesKwargs(ImagesKwargs, total=False):
    max_image_tiles: Optional[int]


class MllamaProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: MllamaImagesKwargs

    _defaults = {
        "image_kwargs": {
            "max_image_tiles": 4,
        },
    }


def get_cross_attention_token_mask(input_ids: List[int], image_token_id: int) -> List[List[int]]:
    """
    Generate a cross-attention token mask for image tokens in the input sequence.

    This function identifies the positions of image tokens in the input sequence and creates
    a mask that defines which subsequent tokens each image token should attend to.

    Args:
        input_ids (List[int]): A list of token ids representing the input sequence.
        image_token_id (int): The id of the token used to represent images in the sequence.

    Returns:
        List[List[int]]: A list of [start, end] pairs, where each pair represents the range
        of tokens an image token should attend to.

    Notes:
        - If no image tokens are present, an empty list is returned.
        - For a single image token, it attends to all subsequent tokens until the end of the sequence.
        - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
        - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
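
    Example (illustrative, added for clarity; token ids are arbitrary and `128256` stands in for the image token id):
        >>> get_cross_attention_token_mask([128256, 10, 11], image_token_id=128256)
        [[0, -1]]
        >>> get_cross_attention_token_mask([128256, 10, 11, 128256, 12, 13], image_token_id=128256)
        [[0, 3], [3, 6]]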
    """
    image_token_locations = [i for i, token in enumerate(input_ids) if token == image_token_id]

    if len(image_token_locations) == 0:
        return []

    # only one image present, unmask until end of sequence
    if len(image_token_locations) == 1:
        return [[image_token_locations[0], -1]]

    vision_masks = [[loc1, loc2] for loc1, loc2 in zip(image_token_locations[:-1], image_token_locations[1:])]

    # last image will attend to all subsequent text
    vision_masks.append([image_token_locations[-1], len(input_ids)])

    # if there are two or more consecutive vision tokens, they should all
    # attend to all subsequent text together
    last_mask_end = vision_masks[-1][1]
    for vision_mask in vision_masks[::-1]:
        if vision_mask[0] == vision_mask[1] - 1:
            vision_mask[1] = last_mask_end
        last_mask_end = vision_mask[1]

    return vision_masks


def convert_sparse_cross_attention_mask_to_dense(
    cross_attention_token_mask: List[List[List[int]]],
    num_tiles: List[List[int]],
    max_num_tiles: int,
    length: int,
) -> np.ndarray:
    """
    Convert the cross attention mask indices to a cross attention mask 4D array.

    This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
    The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

    Args:
        cross_attention_token_mask (List[List[List[int]]]): A nested list structure where:
            - The outer list represents the batch dimension.
            - The middle list represents different images within each batch item.
            - The inner list contains pairs of integers [start, end] representing token ranges for each image.
        num_tiles (List[List[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
        max_num_tiles (int): The maximum possible number of tiles.
        length (int): The total sequence length of the input.

    Returns:
        np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
            The array contains `1` where attention is allowed and `0` where it is not.

    Note:
        - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
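
    Example (illustrative, added for clarity; one batch item whose single image attends to token
    positions 0 through 2, with 2 real tiles out of a maximum of 4):
        >>> mask = convert_sparse_cross_attention_mask_to_dense(
        ...     [[[0, 3]]], num_tiles=[[2]], max_num_tiles=4, length=5
        ... )
        >>> mask.shape
        (1, 5, 1, 4)
        >>> mask[0, :, 0, :].tolist()
        [[1, 1, 0, 0], [1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]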
    """
    batch_size = len(cross_attention_token_mask)
    max_num_images = max([len(masks) for masks in cross_attention_token_mask])

    cross_attention_mask = np.zeros(
        shape=(batch_size, length, max_num_images, max_num_tiles),
        dtype=np.int64,
    )

    for sample_idx, (sample_masks, sample_num_tiles) in enumerate(zip(cross_attention_token_mask, num_tiles)):
        for mask_idx, (locations, mask_num_tiles) in enumerate(zip(sample_masks, sample_num_tiles)):
            if len(locations) == 2:
                start, end = locations
                end = min(end, length)
                # an end of -1 means "attend until the end of the sequence"
                if end == -1:
                    end = length
                cross_attention_mask[sample_idx, start:end, mask_idx, :mask_num_tiles] = 1

    return cross_attention_mask


def build_string_from_input(prompt: str, bos_token: str, image_token: str) -> str:
    """
    Builds a string from the input prompt by adding `bos_token` if not already present.

    Args:
        prompt (`str`):
            The input prompt string.
        bos_token (`str`):
            The beginning of sentence token to be added.
        image_token (`str`):
            The image token used to identify the start of an image sequence.

    Returns:
        str: The modified prompt string with the `bos_token` added if necessary.

    Examples:
        >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'

        >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><begin_of_text>Hello world'

        >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'
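
        An added illustrative case (not in the original docstring): leading image tokens stay ahead of the inserted `bos_token`.

        >>> build_string_from_input("<|image|><|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><|image|><begin_of_text>Hello world'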
    """
    if bos_token in prompt:
        return prompt

    num_image_tokens_on_start = 0
    while prompt.startswith(image_token):
        prompt = prompt[len(image_token) :]
        num_image_tokens_on_start += 1

    return f"{image_token * num_image_tokens_on_start}{bos_token}{prompt}"


class MllamaProcessor(ProcessorMixin):
    """
    Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and
    [`PreTrainedTokenizerFast`] into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~MllamaProcessor.decode`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import MllamaProcessor
        from PIL import Image

        processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")

        processor(
            images=your_pil_image,
            text=["<|image|>If I had to write a haiku for this one"],
            images_kwargs = {"size": {"height": 448, "width": 448}},
            text_kwargs = {"padding": "right"},
            common_kwargs = {"return_tensors": "pt"},
        )
        ```

    Args:
        image_processor ([`MllamaImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.

    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "MllamaImageProcessor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, image_processor, tokenizer, chat_template=None):
        if not hasattr(tokenizer, "image_token"):
            self.image_token = "<|image|>"
            self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        else:
            self.image_token = tokenizer.image_token
            self.image_token_id = tokenizer.image_token_id

        self.python_token = "<|python_tag|>"
        self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token)
        self.bos_token = tokenizer.bos_token
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[MllamaProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` arguments to
        MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
        to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
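
        Example (illustrative sketch, not from the original docstring; it mirrors the class-level example, assumes the
        `meta-llama/Llama-3.2-11B-Vision` checkpoint is available, and output keys may differ across versions):

        ```python
        >>> from PIL import Image
        >>> from transformers import MllamaProcessor

        >>> processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")  # doctest: +SKIP
        >>> image = Image.new("RGB", (448, 448))
        >>> inputs = processor(images=image, text="<|image|>Describe this image.", return_tensors="np")  # doctest: +SKIP
        >>> sorted(inputs.keys())  # doctest: +SKIP
        ['aspect_ratio_ids', 'aspect_ratio_mask', 'attention_mask', 'cross_attention_mask', 'input_ids', 'pixel_values']
        ```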
        """
        if text is None and images is None:
            raise ValueError("You must specify either text or images.")

        output_kwargs = self._merge_kwargs(
            MllamaProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        text_kwargs = output_kwargs["text_kwargs"]
        text_kwargs["return_tensors"] = None  # tensors are built once at the end, after all inputs are merged
        images_kwargs = output_kwargs["images_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        data = {}
        if text is not None:
            if isinstance(text, str):
                text = [text]
            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
            n_images_in_text = [t.count(self.image_token) for t in text]
            text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
            _ = text_kwargs.pop("padding_side", None)  # hack until padding_side is an accepted kwarg by tokenizers
            encoding = self.tokenizer(text, **text_kwargs)
            self._check_special_mm_tokens(text, encoding, modalities=["image"])
            n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]]
            data.update(encoding)

        n_images_in_images = [0]
        if images is not None:
            images = make_nested_list_of_images(images)
            n_images_in_images = [len(sample) for sample in images]

        if text is not None:
            if any(batch_img == 0 for batch_img in n_images_in_text) and not all(
                batch_img == 0 for batch_img in n_images_in_text
            ):
                raise ValueError(
                    "If a batch of text is provided, there should be either no images or at least one image per sample"
                )
            if sum(n_images_in_images) != sum(n_images_in_text) or n_images_in_ids != n_images_in_text:
                if images is None:
                    raise ValueError("No image were provided, but there are image tokens in the prompt")
                add_message = ""
                if len(n_images_in_text) != len(n_images_in_images):
                    add_message = (
                        "Make sure to pass your images as a nested list, where each sub-list holds images per batch"
                    )
                elif n_images_in_ids != n_images_in_text:
                    add_message = (
                        "If you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped."
                    )
                raise ValueError(
                    f"The number of image tokens in each text ({n_images_in_text}) should be the same as the "
                    f"number of provided images per batch ({n_images_in_images}). {add_message}"
                )

        if images is not None:
            image_features = self.image_processor(images, **images_kwargs)
            num_tiles = image_features.pop("num_tiles")
            data.update(image_features)

        # create the cross attention mask
        if images is not None and text is not None:
            cross_attention_token_mask = [
                get_cross_attention_token_mask(token_ids, self.image_token_id) for token_ids in encoding["input_ids"]
            ]
            cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
                cross_attention_token_mask,
                num_tiles=num_tiles,
                max_num_tiles=self.image_processor.max_image_tiles,
                length=max(len(input_ids) for input_ids in encoding["input_ids"]),
            )
            data["cross_attention_mask"] = cross_attention_mask

        return_tensors = common_kwargs.pop("return_tensors", None)
        batch_feature = BatchFeature(data=data, tensor_type=return_tensors)

        return batch_feature

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `List[str]`: The decoded text.
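
        Example (illustrative, added; `generated_ids` is a hypothetical output of `model.generate`):

        >>> texts = processor.post_process_image_text_to_text(generated_ids)  # doctest: +SKIP
        >>> isinstance(texts, list)  # doctest: +SKIP
        True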
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # `num_tiles` is only used internally to build the cross attention mask, so it is not a model input
        image_processor_input_names = [name for name in image_processor_input_names if name != "num_tiles"]
        return list(tokenizer_input_names + image_processor_input_names + ["cross_attention_mask"])


__all__ = ["MllamaProcessor"]