
    fTh^G              
       >   S r SSKJrJrJr  SSKrSSKJr  SSK	J
r
Jr  SSKJrJrJrJr  SSKJrJr   " S	 S
\SS9r " S S\SS9rS\\   S\S\\\      4S jrS\\\\         S\\\      S\S\S\R0                  4
S jrS\S\S\S\4S jr " S S\5      rS/rg)zProcessor class for Mllama.    )ListOptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)ImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   &    \ rS rSr% \\   \S'   Srg)MllamaImagesKwargs   max_image_tiles N)__name__
__module____qualname____firstlineno__r   int__annotations____static_attributes__r       d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mllama/processing_mllama.pyr   r      s    c]"r   r   F)totalc                   ,    \ rS rSr% \\S'   SSS00rSrg)MllamaProcessorKwargs#   images_kwargsimage_kwargsr      r   N)r   r   r   r   r   r   	_defaultsr   r   r   r   r    r    #   s    %% 	q
Ir   r    	input_idsimage_token_idreturnc                    [        U 5       VVs/ s H  u  p#X1:X  d  M  UPM     nnn[        U5      S:X  a  / $ [        U5      S:X  a  US   S//$ [        USS USS 5       VVs/ s H  u  pVXV/PM
     nnnUR                  US   [        U 5      /5        US   S   nUSSS2    H  n	U	S   U	S   S-
  :X  a  XS'   U	S   nM     U$ s  snnf s  snnf )a  
Generate a cross-attention token mask for image tokens in the input sequence.

This function identifies the positions of image tokens in the input sequence and creates
a mask that defines which subsequent tokens each image token should attend to.

Args:
    input_ids (List[int]): A list of token ids representing the input sequence.
    image_token_id (int): The id of the token used to represent images in the sequence.

Returns:
    List[List[int]]: A list of [start, end] pairs, where each pair represents the range
    of tokens an image token should attend to.

Notes:
    - If no image tokens are present, an empty list is returned.
    - For a single image token, it attends to all subsequent tokens until the end of the sequence.
    - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
    - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
r      N)	enumeratelenzipappend)
r&   r'   itokenimage_token_locationsloc1loc2vision_maskslast_mask_endvision_masks
             r   get_cross_attention_token_maskr8   -   s   , 09/C_/C81uG^Q/C_
 !Q&	  !Q&&q)2.//367LSb7QShijikSl3mn3mZTTL3mLn .r2C	NCD
 !$Q'M#DbD)q>[^a//*N#A *
 / ` os   CC$Ccross_attention_token_mask	num_tilesmax_num_tileslengthc           	         [        U 5      n[        U  Vs/ s H  n[        U5      PM     sn5      n[        R                  " XCXb4[        R                  S9n[        [        X5      5       H[  u  nu  p[        [        X5      5       H;  u  nu  p[        U5      S:X  d  M  Uu  p[        X5      nUS:X  a  UnSXxX2USU24'   M=     M]     U$ s  snf )a  
Convert the cross attention mask indices to a cross attention mask 4D array.

This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

Args:
    cross_attention_token_mask (List[List[List[int]]]): A nested list structure where:
        - The outer list represents the batch dimension.
        - The middle list represents different images within each batch item.
        - The inner list contains pairs of integers [start, end] representing token ranges for each image.
    num_tiles (List[List[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
    max_num_tiles (int): The maximum possible number of tiles.
    length (int): The total sequence length of the input.

Returns:
    np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
        The array contains `1` where attention is allowed and `0` where it is not.

Note:
    - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
)shapedtype   r+   r*   N)r-   maxnpzerosint64r,   r.   min)r9   r:   r;   r<   
batch_sizemasksmax_num_imagescross_attention_mask
sample_idxsample_maskssample_num_tilesmask_idx	locationsmask_num_tilesstartends                   r   ,convert_sparse_cross_attention_mask_to_denserR   ]   s    : /0J2LM2L#e*2LMNN88>Ahh
 9B#F`Bl8m4
4\5>s<?b5c1H1y9~"&
#&"9 CYZ$Ho~o%UV 6d 9n   Ns   Cprompt	bos_tokenimage_tokenc                     X;   a  U $ SnU R                  U5      (       a+  U [        U5      S n US-  nU R                  U5      (       a  M+  X#-   U U  3$ )a  
Builds a string from the input prompt by adding `bos_token` if not already present.

Args:
    prompt (`str`):
        The input prompt string.
    bos_token (`str`):
        The beginning of sentence token to be added.
    image_token (`str`):
        The image token used to identify the start of an image sequence.

Returns:
    str: The modified prompt string with the `bos_token` added if necessary.

Examples:
    >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
    '<begin_of_text>Hello world'

    >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
    '<|image|><begin_of_text>Hello world'

    >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
    '<begin_of_text>Hello world'
r   Nr*   )
startswithr-   )rS   rT   rU   num_image_tokens_on_starts       r   build_string_from_inputrY      so    4  !


K
(
(K(*+!Q&! 

K
(
( 56yk&JJr   c                      ^  \ rS rSrSrSS/rS/rSrSrSU 4S jjr	    SS	\
\   S
\
\\\\\   \\   4      S\\   S\4S jjrS rS r SS jr\S 5       rSrU =r$ )MllamaProcessor   a  
Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and
[`PretrainedTokenizerFast`] into a single processor that inherits both the image processor and
tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
information.
The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
    ```python
    from transformers import MllamaProcessor
    from PIL import Image

    processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")

    processor(
        images=your_pil_image,
        text=["<|image|>If I had to write a haiku for this one"],
        images_kwargs = {"size": {"height": 448, "width": 448}},
        text_kwargs = {"padding": "right"},
        common_kwargs = {"return_tensors": "pt"},
    )
    ```

Args:
    image_processor ([`MllamaImageProcessor`]):
        The image processor is a required input.
    tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
        The tokenizer is a required input.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.

image_processor	tokenizerchat_templateMllamaImageProcessorPreTrainedTokenizerFastc                 H  > [        US5      (       d(  SU l        UR                  U R                  5      U l        O"UR                  U l        UR                  U l        SU l        UR                  U R                  5      U l        UR                  U l        [        TU ]!  XUS9  g )NrU   z	<|image|>z<|python_tag|>)r_   )	hasattrrU   convert_tokens_to_idsr'   python_tokenpython_token_idrT   super__init__)selfr]   r^   r_   	__class__s       r   rh   MllamaProcessor.__init__   s    y-00*D"+"A"A$BRBR"SD(44D"+":":D,(>>t?P?PQ",,=Qr   imagestextkwargsr(   c           
      H   Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6nUS   nSUS'   US   nUS   n	0 n
UGb%  [        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S	5      eU Vs/ s H  oR                  U R                  5      PM     nnU Vs/ s H#  n[        XR                  U R                  5      PM%     nnUR                  S
S5      nU R                  " U40 UD6nU R                  X/S/S9  US    Vs/ s H  nUR                  U R                   5      PM      nnU
R#                  U5        S/nUb&  [%        U5      nU Vs/ s H  n['        U5      PM     nnUb  [)        S W 5       5      (       a"  [        S U 5       5      (       d  [        S5      e[+        U5      S:  aY  UU:w  d  WU:w  aM  Uc  [        S5      eSn[+        U5      [+        U5      :X  a	  UU:w  a  SnOWU:w  a  Sn[        SU SU SU 35      eUb5  U R,                  " U40 UD6nUR                  S5      nU
R#                  U5        Ubd  Uba  WS    Vs/ s H  n[/        UU R                   5      PM     nn[1        UWU R,                  R2                  [5        S US    5       5      S9nUU
S'   U	R                  SS5      n[7        U
US9nU$ s  snf s  snf s  snf s  snf s  snf )aF  
Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` arguments to
MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
to the docstring of the above two methods for more information.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.
    text (`str`, `List[str]`, `List[List[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:
            - `'tf'`: Return TensorFlow `tf.constant` objects.
            - `'pt'`: Return PyTorch `torch.Tensor` objects.
            - `'np'`: Return NumPy `np.ndarray` objects.
            - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
Nz'You must specify either text or images.tokenizer_init_kwargstext_kwargsreturn_tensorsr"   common_kwargsc              3   B   #    U  H  n[        U[        5      v   M     g 7fN)
isinstancestr).0ts     r   	<genexpr>+MllamaProcessor.__call__.<locals>.<genexpr>  s     =_Z^UVjC>P>PZ^s   zAInvalid input text. Please provide a string, or a list of stringspadding_sideimage)
modalitiesr&   r   c              3   *   #    U  H	  oS :H  v   M     g7fr   Nr   rx   	batch_imgs     r   rz   r{   -  s     D3Ci>3C   c              3   *   #    U  H	  oS :H  v   M     g7fr   r   r   s     r   rz   r{   -  s      Q0@9Q0@r   zaIf a batch of text is provided, there should be either no images or at least one image per samplez@No image were provided, but there are image tokens in the prompt zZMake sure to pass your images as a nested list, where each sub-list holds images per batchzhIf you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped.z)The number of image tokens in each text (zA) should be the same as the number of provided images per batch (z). r:   c              3   8   #    U  H  n[        U5      v   M     g 7fru   )r-   )rx   r&   s     r   rz   r{   R  s     Q;Pi3y>>;Ps   )r:   r;   r<   rI   )datatensor_type)
ValueError_merge_kwargsr    r^   init_kwargsrv   rw   listtupleallcountrU   rY   rT   pop_check_special_mm_tokensr'   updater	   r-   anysumr]   r8   rR   r   rA   r   )ri   rl   rm   audiovideosrn   output_kwargsrq   r"   rs   r   ry   n_images_in_text	text_item_encoding	token_idsn_images_in_idsn_images_in_imagessampleadd_messageimage_featuresr:   r9   rI   rr   batch_features                              r   __call__MllamaProcessor.__call__   s   N <FNFGG**!
"&.."<"<
 
 $M2(,$%%o6%o6$$$v e}55#=_Z^=_:_:_ !deeCGH4a(8(8 94Hjnojn]f+I~~tGWGWXjnDo5A~~d:k:H))$gY)OU]^iUjkUj	yt/B/BCUjOkKK!S/7F<B!CF&#f+F!CD3CDDDS Q0@Q N N !w  #$q("&66/M_:_>$%ghh"$K-.#6F2GGL^brLr 'C(,>> 'Q$CDTCU V@@R?SSVWbVce 
 !11&JMJN&**;7IKK' $"2`hit`u*`uS\.y$:M:MN`u ' * $P*#"22BBQ8K;PQQ	$  ,@D'(&**+;TB$$NKu  Io l "DB*s   3$L*L%LL Lc                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
r^   batch_decoderi   argsrn   s      r   r   MllamaProcessor.batch_decode[  s    
 ~~**D;F;;r   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r^   decoder   s      r   r   MllamaProcessor.decodeb  s    
 ~~$$d5f55r   c                 B    U R                   R                  " U4UUS.UD6$ )a*  
Post-process the output of the model to decode the text.

Args:
    generated_outputs (`torch.Tensor` or `np.ndarray`):
        The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
        or `(sequence_length,)`.
    skip_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
    **kwargs:
        Additional arguments to be passed to the tokenizer's `batch_decode method`.

Returns:
    `List[str]`: The decoded text.
)skip_special_tokensclean_up_tokenization_spacesr   )ri   generated_outputsr   r   rn   s        r   post_process_image_text_to_text/MllamaProcessor.post_process_image_text_to_texti  s3    ( ~~**
 3)E
 	
 	
r   c                     U R                   R                  nU R                  R                  nU Vs/ s H  o3S:w  d  M
  UPM     nn[        X-   S/-   5      $ s  snf )Nr:   rI   )r^   model_input_namesr]   r   )ri   tokenizer_input_namesimage_processor_input_namesnames       r   r   !MllamaProcessor.model_input_names  sb     $ @ @&*&:&:&L&L# 9T&k8S_jWjt8S#&k)GKaJbbcc 'ls
   	AA)rT   rU   r'   re   rf   ru   )NNNN)TF)r   r   r   r   __doc__
attributesvalid_kwargsimage_processor_classtokenizer_classrh   r   r   r   r   r   r   r   r    r   r   r   r   r   propertyr   r   __classcell__)rj   s   @r   r[   r[      s    > $[1J#$L2/OR (,hlu$u uY(94	?DQbLccdeu ./u 
un<6 Y^
6 d dr   r[   )r   typingr   r   r   numpyrB   feature_extraction_utilsr   image_utilsr   r	   processing_utilsr
   r   r   r   tokenization_utils_baser   r   r   r    r   r8   ndarrayrR   rw   rY   r[   __all__r   r   r   <module>r      s     " ( (  4 A V V#U #,E -d3i - -QUVZ[^V_Q` -`-  $T$s)_ 5- DI-  -  	- 
 ZZ- `"KC "KC "Kc "Kc "KJZdn Zdz 
r   