
    fTh'C                         S SK JrJrJr  S SKrS SKJrJrJ	r	J
r
  S SKJrJr  SSKJr  SSKJrJrJr  SSKJrJrJrJr   " S	 S
\SS9r " S S\SS9r " S S\	5      rS/rg)    )ListOptionalUnionN)ImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput   )BatchFeature)
ImageInputconcatenate_listmake_flat_list_of_images)
VideoInputVideoMetadata
load_videomake_batched_videosc                   F    \ rS rSr% \\   \S'   \\   \S'   \\   \S'   Srg)InternVLImagesKwargs&   crop_to_patchesmin_patchesmax_patches N)	__name__
__module____qualname____firstlineno__r   bool__annotations__int__static_attributes__r       h/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/internvl/processing_internvl.pyr   r   &   s     d^###r$   r   F)totalc                   4    \ rS rSr% \\S'   SS0SS00 S.rSrg	)
InternVLProcessorKwargs,   images_kwargspadding_sideleftr   T)text_kwargsr*   videos_kwargsr   N)r   r   r   r   r   r!   	_defaultsr#   r   r$   r%   r(   r(   ,   s,    '' F
 t
 Ir$   r(   c                     ^  \ rS rSrSr/ SQrSS/rSrSrSr	     S"S\
4U 4S	 jjjrS
\\   S\\
   S\\
   S\R                  S\R                  S\R                  4S jr    S#S\\   S
\\\\\\   \\   4      S\\   S\\   S\4
S jjr S$S\S\\
   S\\\\
4   4S jjrS r S r!\"S 5       r#  S%S\\S4   S\\
   S\S\S\RH                  4
S  jjr%S!r&U =r'$ )&InternVLProcessor9   a  
Constructs a InternVL processor which wraps a [`AutoImageProcessor`] and
[`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and
tokenizer functionalities. See the [`~InternVLProcessor.__call__`] and [`~InternVLProcessor.decode`] for more information.
Args:
    image_processor ([`AutoImageProcessor`], *optional*):
        The image processor is a required input.
    tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
        The tokenizer is a required input.
    video_processor ([`AutoVideoProcessor`], *optional*):
        The video processor is a required input.
    image_seq_length (`int`, *optional*, defaults to 256):
        The number of image token to use per image patch. it should be set so that:
        image_seq_length = (config.image_size // config.patch_size) ** 2 * (config.scale_factor**2)
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
)image_processor	tokenizervideo_processorchat_templateimage_seq_lengthAutoImageProcessorAutoVideoProcessorAutoTokenizerc                    > X@l         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        [        TU ]$  " XU4SU0UD6  g )Nr6   )
r7   start_image_tokenend_image_tokencontext_image_tokenimage_tokenvideo_tokencontext_image_token_idimage_token_idsuper__init__)selfr3   r4   r5   r7   r6   kwargs	__class__s          r%   rD   InternVLProcessor.__init__U   sl     !1!*!<!<(88$88$00'>>_lTaleklr$   textimage_num_patchesvideo_num_patchesimage_num_patches_indicesvideo_num_patches_indicesvideo_patch_indicesc	           	      x  ^ ^ Sn	Sn
/ n/ n/ nU GH$  nUnT R                   U;   d  T R                  U;   Ga  T R                   U;   a  T R                  U;  d8  UR                  T R                   5      UR                  T R                  5      :  a  U	S:  a  XiS-
     OSnXi   nUR                  UUU 5        UR	                  T R                   SS5      nUR                  T R
                   T R                   T R                  -  XI   -   T R                   35        U	S-  n	OU
S:  a  XS-
     OSnX   nU
S:  a  UU   OSnUUS-
     nUR                  UUU 5        [        UUU 5      mSR                  UU 4S j[        [        T5      5       5       5      nUR                  U5        UR	                  T R                  SS5      nU
S-  n
T R                   U;   a  GM  T R                  U;   a  GM  SU;   a,  UR                  S5      nUR	                  SUS5      nSU;   a  M,  UR                  U5        GM'     XX4$ )z
Processes interleaved text with <image> and <video> placeholders, replacing them with appropriate
image and video tokens while keeping track of the patches used.
r      z<placeholder>
c              3      >#    U  HE  nS US-    STR                    TR                  TR                  -  TU   -   TR                   3v   MG     g7f)FramerP   z: N)r<   r?   r7   r=   ).0inum_patchesrE   s     r%   	<genexpr>?InternVLProcessor._insert_media_placeholders.<locals>.<genexpr>   sr      -!8A  Awb)?)?(@AQAQTXTiTiAilwxylzAz@{  }A  }Q  }Q  |R  S!8s   AA)r?   r@   indexappendreplacer<   r7   r=   listjoinrangelenpop)rE   rI   image_pixel_valuesvideo_pixel_valuesrJ   rK   rL   rM   rN   image_indexvideo_indexprocessed_textimage_video_patchesreplace_stringsprompt
new_promptstart_index	end_indexcurrent_patch_indexend_patch_indexvideo_promptreplace_strrV   s   `                     @r%   _insert_media_placeholders,InternVLProcessor._insert_media_placeholdersg   s      FJ""j0D4D4D
4R##z1$$J6!''(8(89J<L<LTM]M]<^^ Q\^_P_";!O"LefK 9 FI'../A+i/XY!+!3!3D4D4DoWX!YJ#**11243C3CdF[F[3[^o^|3|2}  C  S  S  ~T  U  1$K
 S^`aRa*=Ao*Ngh'&9&FOT_bcTc";<O"PijK 9/A:M NI'../A+i/XY"&'89L_']"^K#'99 -!&s;'7!8- $L $**<8!+!3!3D4D4DoWX!YJ1$KA ""j0D4D4D
4RB "Z/-11!4'//aP
 "Z/ !!*-M P KLLr$   imagesvideosrF   returnc           
         Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6n[        U[        [        45      (       d  U/n/ n/ n0 n	Sn
Sn[        R                  " S/5      n[        R                  " S/5      n[        R                  " S/5      nUbZ  [        U5      nU R                  " SSU0US   D6nUR                  S5      nUR                  S5      n
[        R                  " U5      nUb  [        U5      nU Vs/ s H  n[        U5      PM     nn[        R                  " U5      nU R                   " SS	U0US
   D6nU VVs/ s H  n[#        U5        H  nSPM     M     nnnUR                  S5      R%                  SS5      n[        R                  " U5      nUc  Ubd  U R'                  UU
UUUUUU5      u  nnnnUb  U[        U5      :w  a  [        S5      eUb  U[        U5      :w  a  [        S5      eS[)        U5      0n	US   R                  SS5      nU R                  " U40 US   D6nU R+                  UUS/S9  [-        0 UEU	EUS9$ s  snf s  snnf )a  
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
`crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwrags` arguments to
GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.
    text (`str`, `List[str]`, `List[List[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
        The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:
        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
NzYou have to specify text.tokenizer_init_kwargsr   rr   r*   rV   pixel_valuesrs   r.   rP   pixel_values_videoszONumber of image placeholders in the prompt does not match the number of images.zONumber of video placeholders in the prompt does not match the number of videos.r-   return_tensorsimage)
modalities)datatensor_typer   )
ValueError_merge_kwargsr(   r4   init_kwargs
isinstancer\   tuplenparrayr   r3   r`   cumsumr   r_   r5   r^   flattenrp   r   _check_special_mm_tokensr   )rE   rr   rI   audiors   rF   output_kwargsrJ   rK   image_videos_inputsra   rb   rL   rN   rM   image_inputsvideonum_frames_per_videovideo_inputsframes_rf   rc   rd   ry   text_inputss                             r%   __call__InternVLProcessor.__call__   s   R <899**#
"&.."<"<
 
 $u..6D  !!$&HHaSM! hhsm$&HHaSM!-f5F//`v`A_`L , 0 0 ?!-!1!1.!A(*		2C(D%(0F<B#CF5CJF #C"$)),@"A//`v`A_`L1E ]1EvuU[}!}1E ]!-!1!12G!H!P!PQRTU!V(*		2C(D%!3BFBaBa""!!))#	C?D%{K !kS[&@ !rss!kS[&@ !rss $23CDW3X"Y&}599:JDQnnTJ]=-IJ%%dKWI%N!GK!G3F!GUcdd= $D !^s   I/!I4metadata
num_framesinitial_shiftc                     Ub  UOUR                   nUSL a  UR                   U-  S-  n[        R                  " X1R                   UR                   U-  5      R                  [        5      nU$ )aR  
The function to generate indices of frames to sample from a video.

Args:
    metadata (`VideoMetadata`):
        `VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
    num_frames (`int`, *optional*):
        Number of frames to sample uniformly. If None, all frames are sampled.
    initial_shift (`bool`, `float` or `int`, defaults to `0`):
        The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.

Returns:
    `np.ndarray`: Array of frame indices to sample.
T   )total_num_framesr   arangeastyper"   )rE   r   r   r   indicess        r%   sample_indices_fn#InternVLProcessor.sample_indices_fn  sj    " $.#9Zx?X?X
D $55
BQFM))M+D+DhF_F_blFlmtt
 r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r4   batch_decoderE   argsrF   s      r%   r   InternVLProcessor.batch_decode'  s    
 ~~**D;F;;r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r4   decoder   s      r%   r   InternVLProcessor.decode.  s    
 ~~$$d5f55r$   c                     U R                   R                  nU R                  R                  n[        U5      [        U5      -   $ )N)r4   model_input_namesr3   r\   )rE   tokenizer_input_namesimage_processor_input_namess      r%   r   #InternVLProcessor.model_input_names5  s;     $ @ @&*&:&:&L&L#)*T2M-NNNr$   r   r   backendc                 6   ^ ^^ UUU 4S jn[        XUS9u  pX4$ )aR  
Loads `video` to a numpy array.

Args:
    video (`str` or `VideoInput`):
        The video to convert to the numpy array format. Can be a link to video or local path.
    num_frames (`int`, *optional*):
        Number of frames to sample uniformly. If not passed, the whole video is loaded.
    backend (`str`, *optional*, defaults to `"pyav"`):
        The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav".
    initial_shift (`bool`, *optional*, defaults to `True`):
        The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.

Returns:
    Tuple[`np.array`, Dict]: A tuple containing:
        - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
        - Metadata dictionary.
c                 0   > TR                   " U 4TTS.UD6$ )N)r   r   )r   )r   	fn_kwargsr   r   rE   s     r%   sample_indices_fn_funcGInternVLProcessor._load_video_for_model.<locals>.sample_indices_fn_funcW  s"    ))(tzYftjsttr$   )r   r   )r   )rE   r   r   r   r   rF   r   r   s   ` ` `   r%   _load_video_for_model'InternVLProcessor._load_video_for_model<  s"    6	u %UOefr$   )r=   r7   r?   rB   r<   r@   )NNN   N)NNNN)NT)pyavT)(r   r   r   r   __doc__
attributesvalid_kwargsimage_processor_classvideo_processor_classtokenizer_classr"   rD   r\   strr   ndarrayrp   r   r   r   r   r
   r   r   r	   r(   r   r   r   r    floatr   r   r   propertyr   r   r   r#   __classcell__)rG   s   @r%   r1   r1   9   s   $ EJL 10%O  #m
 m m$>M3i>M
  9>M  9>M $&::>M $&::>M  ZZ>MD (,hl'+de$de uY(94	?DQbLccdede
 $de 01de 
deN sw%3;C=X]^bdikn^nXo4<6 O O "S,&' SM 	
  
 r$   r1   )typingr   r   r   numpyr   transformers.processing_utilsr   r   r   r	   $transformers.tokenization_utils_baser
   r   image_processing_utilsr   image_utilsr   r   r   video_utilsr   r   r   r   r   r(   r1   __all__r   r$   r%   <module>r      sl   " ) (   N 2 
 V U<u 
.e 
b bJ	 
r$   