o
    Zho3                     @   sr  d dl mZmZmZ d dlZddlmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZ e rQddlmZ e rde r^d dlmZ nd dl mZ e rkd dl!Z!ddlm"Z" e"#e$Z%dZ&dZ'dZ(dZ)dZ*d#ddZ+de,d dee- fddZ.de-de/e-e-f fddZ0G dd deZ1edd G d!d" d"eZ2d"gZ3dS )$    )ListOptionalUnionN   )BatchFeature)IMAGENET_STANDARD_MEANIMAGENET_STANDARD_STDSizeDict)UnpackVideosKwargs)
TensorTypeis_torch_availableis_torchvision_availableis_torchvision_v2_availableis_vision_available)requires)BaseVideoProcessor)group_videos_by_shapereorder_videos)PILImageResampling)
functional)loggingzYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.zgYou are provided the following series of {frame_count} frames from a {video_duration} [H:MM:SS] video.
z

z
Frame from {timestamp}:i   c                 C   s  t | dd}|dkrtd| dt | dd}t | dd}|dkr*td| dtt|| }t||}|d	k r=d	}d}	|d	 }
|dkr_|d
|  || kr_t|| }	t|||  }
td|	}	t|
|d	 }
|	|
krvd|d	 }	}
tj|	|
|td}t|}|S )a]  
    Example sampling function which:
      - Uses `max_frames` (if provided) or calculates it from `fps` and metadata.
      - Applies a basic center-skip if fewer frames than available, otherwise
        optionally skips `skip_secs` from both the start and end.
      - Uniformly samples the desired number of frames between the start and end indices.

    Args:
        max_frames (`int`):
            Maximum number of frames to sample.
        target_fps (`int`):
            Target frames to sample per second.
        metadata (`dict`):
            Contains video metadata such as "n_frames" and "video_fps".
        skip_secs (`float`, *optional*, defaults to 1.0):
            Number of seconds to skip from the start and end if the video is long enough.

    Returns:
        numpy.ndarray:
            An array of unique frame indices to sample.
    total_num_framesr   zInvalid total_num_frames=z in metadata.Zfpsg      >@durationzInvalid duration_seconds=      Zdtype)	getattr
ValueErrorintroundminmaxnpZlinspaceunique)metadata
max_framesZ
target_fpsZ	skip_secsr   Z
native_fpsZduration_secondsZestimated_framesZdesired_framesZ	start_idxZend_idxindices r(   c/var/www/auris/lib/python3.10/site-packages/transformers/models/smolvlm/video_processing_smolvlm.pysmolvlm_sample_indices_fnG   s.   


r*   videostorch.Tensorreturnc                 C   sF   t d }}| D ]}| dd \}}t||}t||}q||fS )zH
    Get the maximum height and width across all videos in a batch.
    z-infN)floatsizer"   )r+   
max_height	max_widthvideoheightwidthr(   r(   r)   get_max_height_width   s   
r6   resolution_max_sidec                 C   s   |   dd \}}tt|}|du rt||n|}|| }||kr5|}t|| }|d dkr4|d7 }n||krK|}t|| }|d dkrK|d7 }t|d}t|d}||fS )a  
    Get the output size of the video after resizing given a dictionary specifying the max and min sizes.
    Args:
        video (`np.ndarray`):
            Video to resize.
        resolution_max_side (`int`):
            The longest edge of the video will be resized to this value. The shortest edge will be resized to keep the
            input aspect ratio.
    Returns:
        The output size of the video after resizing.
    r.   Nr   r   r   )r0   r!   MAX_IMAGE_SIZEr"   r   )r3   r7   r4   r5   Zaspect_ratior(   r(   r)   get_resize_output_image_size   s$   


r9   c                   @   s   e Zd ZdS )SmolVLMVideoProcessorInitKwargsN)__name__
__module____qualname__r(   r(   r(   r)   r:      s    r:   )Ztorchvision)backendsc                       s  e Zd ZejZddiZeZe	Z
dZdZdZdZdZeZddgZdee f fddZ			d%d
ddedddeddf
ddZ		d&d
ddeeef dedefddZ		d'ded dedededed dedededed eeeee f  d!eeeee f  d"eeee f  fd#d$Z!  Z"S )(SmolVLMVideoProcessorlongest_edgei  Tpixel_valuespixel_attention_maskkwargsc                    s   t  jdi | d S )Nr(   )super__init__)selfrC   	__class__r(   r)   rE      s   zSmolVLMVideoProcessor.__init__Nr3   r,   r0   interpolationzF.InterpolationMode	antialiasr-   c                 K   s   |dur|nt jj}|t jjkrtd t jj}|jr$t||jd}n|j	r1|j
r1|j	|j
f}ntd| dt j||||dS )a9  
        Resize an video to `(size["height"], size["width"])`.
        Args:
            video (`torch.Tensor`):
                Video to resize.
            size (`SizeDict`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output video.
            resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                `InterpolationMode` filter to use when resizing the video e.g. `InterpolationMode.BICUBIC`.
        Returns:
            `torch.Tensor`: The resized video.
        NzYou have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. BICUBIC resample will be used as an alternative. Please fall back to image processor if you want full consistency with the original model.)r7   zHSize must contain 'height' and 'width' keys, or 'longest_edge' key. Got .)rI   rJ   )FZInterpolationModeZBILINEARLANCZOSloggerZwarning_onceZBICUBICr@   r9   r4   r5   r   resize)rF   r3   r0   rI   rJ   rC   new_sizer(   r(   r)   rO      s   zSmolVLMVideoProcessor.resizer   padded_sizefillreturn_pixel_maskc           
      C   s   |  dd }|d |d  }|d |d  }|dk s |dk r+td| d| d||kr=dd||g}tj|||d}d}	|rbtj|d	dddddf tjd
}	d|	d	d|d d|d f< ||	fS )a  Pads the sample with empty video to the padded_size
        Args:
            video (`torch.Tensor`):
                Video to pad.
            padded_size (`Tuple[int, int]`):
                Height and width to pad.
            fill (`int`, *optional*):
                The value to use for the padding.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
                Whether to return a pixel mask.
        r.   Nr   r   zzPadding dimensions are negative. Please make sure that the padded size is larger than the original size. Got padded size: z, original size: rK   )rR   .r   )r0   r   rL   padtorchZ
zeros_likeZint64)
rF   r3   rQ   rR   rS   Zoriginal_sizepadding_bottompadding_rightpaddingZ
pixel_maskr(   r(   r)   rT      s&   $zSmolVLMVideoProcessor.padr+   do_convert_rgb	do_resize
do_rescalerescale_factordo_normalizedo_pad
image_mean	image_stdreturn_tensorsc              	   K   sV  t |\}}i }| D ]\}}|r| |}|r!| j|||d}|||< qt||}t |\}}i }| D ]\}}| |||||
|}|||< q7t||}|	rt|}t |\}}i }i }| D ]\}}| j||d\}}|||< |||< qct||}t||}|rtj	|ddn|}d|i}|	r|	r|d urtj	|ddn||d< t
||dS )N)r0   rI   )rQ   r   )dimrA   rB   )Ztensor_type)r   itemsZconvert_to_rgbrO   r   Zrescale_and_normalizer6   rT   rU   stackr   )rF   r+   rY   rZ   r0   rI   r[   r\   r]   r^   r_   r`   ra   rC   Zgrouped_videosZgrouped_videos_indexZresized_videos_groupedshapeZstacked_videosZresized_videosZprocessed_videos_groupedZprocessed_videosZpad_sizeZprocessed_padded_mask_groupedZpadded_masksrB   datar(   r(   r)   _preprocess  sL   







z!SmolVLMVideoProcessor._preprocess)NT)r   T)N)#r;   r<   r=   r   rM   Zresampler0   r   r_   r   r`   rZ   r[   r]   rY   r^   r:   Zvalid_kwargsZmodel_input_namesr
   rE   r	   boolrO   tupler   rT   r   r   r/   r   strr   rg   __classcell__r(   r(   rG   r)   r?      s~    
.

3	
r?   )r   )4typingr   r   r   numpyr#   Zimage_processing_utilsr   Zimage_utilsr   r   r	   Zprocessing_utilsr
   r   utilsr   r   r   r   r   Zutils.import_utilsr   Zvideo_processing_utilsr   Zvideo_utilsr   r   r   Ztorchvision.transforms.v2r   rL   Ztorchvision.transformsrU   r   Z
get_loggerr;   rN   ZDEFAULT_SYSTEM_MESSAGEZDEFAULT_VIDEO_INTROZDEFAULT_MEDIA_OUTTROZFRAME_TIMESTAMP_MESSAGEr8   r*   listr   r6   ri   r9   r:   r?   __all__r(   r(   r(   r)   <module>   sJ   

<

( 
 