o
    Zhm                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlmZ d dlZd dlZddlmZmZ dd	lmZmZmZ dd
lmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& e# rvd dl'Z(d dl)Z(e" rvd dl*mZ+ e  r}d dl,Z,e%-e.Z/eed dded ed eed  eed  eed  f Z0eG dd dZ1dd Z2dd Z3dd Z4dd Z5dej6de7fddZ8dee0 deed  fd d!Z9deed  fd"d#Z:dOdej6d$edee;e;f fd%d&Z<dOd'e;d(ee; fd)d*Z=dPd+e1fd,d-Z>d.e?d/efd0d1Z@	dOd.e?d/ee fd2d3ZAd.e?d/efd4d5ZBd.e?d/efd6d7ZCeAe@eBeCd8ZD			9	dQdee?d:f d(ee; d;ee; d<e?d/ee dejEfd=d>ZF		dPdejEd?ee d@eee?ef  dejEfdAdBZGejHdCddfdej6dDee;ee;e;f e
ee;e;f  f dEedFeeIe
eI f d?eee?ef  d@eee?ef  dej6fdGdHZJded dee	ee;e;f ed f e	e;eee;e;f e;f f f fdIdJZKdKe	ee;e;f df dLe	e;ee;e;f f ded fdMdNZLdS )R    N)redirect_stdout)	dataclass)BytesIO)CallableDictIterableListOptionalTupleUnion)urlparse   )PaddingModeto_channel_dimension_format)ChannelDimensioninfer_channel_dimension_formatis_valid_image)is_av_availableis_cv2_availableis_decord_availableis_numpy_arrayis_torch_availableis_torch_tensoris_torchvision_availableis_vision_availableis_yt_dlp_availableloggingrequires_backends)iozPIL.Image.Image
np.ndarraytorch.Tensorznp.ndarrrayc                   @   s.   e Zd ZU eed< eed< eed< eed< dS )VideoMetadatatotal_num_framesfpsdurationvideo_backendN)__name__
__module____qualname__int__annotations__floatstr r-   r-   G/var/www/auris/lib/python3.10/site-packages/transformers/video_utils.pyr!   E   s
   
 r!   c                 C   s(   t | tjjpt| st| o| jdkS )N   )
isinstancePILZImager   r   ndim)framer-   r-   r.   is_valid_video_frameM   s   r4   c                 C   s:   t | ttfst| st| o| jdkS tdd | D S )N   c                 s   s    | ]}t |V  qd S N)r4   .0r3   r-   r-   r.   	<genexpr>V   s    z!is_valid_video.<locals>.<genexpr>)r0   listtupler   r   r2   allvideor-   r-   r.   is_valid_videoS   s   r?   c                 C   sL   t | ttfr| D ]}t|st|s dS q	dS t| r"| jdkr$dS dS )NF   T)r0   r:   r;   r?   r4   r2   )videosZvideo_or_framer-   r-   r.   valid_videosY   s   rB   c                 C   s<   t | ttfrt| d S t| st| r| jdkrdS dS )Nr   r@   TF)r0   r:   r;   r?   r   r   r2   rA   r-   r-   r.   is_batched_videoe   s
   rD   r>   returnc                 C   s   t | dkot | dkS )zV
    Checks to see whether the pixel values have already been rescaled to [0, 1].
    r   r   )npminmaxr=   r-   r-   r.   is_scaled_videom   s   rI   rA   )r   r    c                 C   sJ   t | d ttfs| S g }| D ]}dd |D }t|}|| q|S )aK  
    Given a batch of videos, converts each video to a 4D array. If video is already in array type,
    it is simply returned. We assume that all inputs in the list are in the same format, based on the type of the first element.

    Args:
        videos (`VideoInput`):
            Video inputs to turn into a list of videos.
    r   c                 S   s   g | ]}t |qS r-   )rF   arrayr7   r-   r-   r.   
<listcomp>   s    z/convert_pil_frames_to_video.<locals>.<listcomp>)r0   r:   r;   rF   stackappend)rA   Zvideo_convertedr>   r-   r-   r.   convert_pil_frames_to_videou   s   

rN   c                 C   s   t stdt|  dt| r	 t| S t| r | g} t| S t| r0t| d g} t| S t| d t	t
frHt| d d rHdd | D S t| S )a  
    Ensure that the input is a list of videos. If the input is a single video, it is converted to a list of length 1.
    If the input is a batch of videos, it is converted to a list of 4D video arrays. Videos passed as list `PIL.Image`
    frames are converted to 4D arrays.

    We assume that all inputs in the list are in the same format, based on the type of the first element.

    Args:
        videos (`VideoInput`):
            Video inputs to turn into a list of videos.
    zkInvalid video input. Expected either a list of video frames or an input of 4 or 5 dimensions, but got type .)N.r   c                 S   s   g | ]	}|D ]}|qqS r-   r-   )r8   Zsublistr>   r-   r-   r.   rK      s    z'make_batched_videos.<locals>.<listcomp>)rB   
ValueErrortyperD   r?   r   rF   rJ   r0   r:   r;   rN   rC   r-   r-   r.   make_batched_videos   s$   	"rR   channel_dimc                 C   sZ   |du rt | }|tjkr| jd | jd fS |tjkr&| jd | jd fS td| )a  
    Returns the (height, width) dimensions of the video.

    Args:
        video (`np.ndarray`):
            The video to get the dimensions of.
        channel_dim (`ChannelDimension`, *optional*):
            Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the video.

    Returns:
        A tuple of the video's height and width.
    NzUnsupported data format: )r   r   FIRSTshapeZLASTrP   )r>   rS   r-   r-   r.   get_video_size   s   

rY   r"   
num_framesc                 C   s:   |durt d| | | t}|S t d| t}|S )a  
    Creates a numpy array for uniform sampling of `num_frame` frames from `total_num_frames`
    when loading a video.

    Args:
        total_num_frames (`int`):
            Total number of frames that a video has.
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly. If not specified, all frames are sampled.

    Returns:
        np.ndarray: np array of frame indices that will be sampled.
    Nr   )rF   arangeZastyper)   )r"   rZ   indicesr-   r-   r.   get_uniform_frame_indices   s
   r]   metadatac                 K   s   | j }| j}|du r(|dur(t|| | }||kr(td| d| d| d|dur9tjd||| td}|S tjd|td}|S )a`  
    A default sampling function that replicates the logic used in get_uniform_frame_indices,
    while optionally handling `fps` if `num_frames` is not provided.

    Args:
        metadata (`VideoMetadata`):
            `VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly.
        fps (`int`, *optional*):
            Desired frames per second. Takes priority over num_frames if both are provided.

    Returns:
        `np.ndarray`: Array of frame indices to sample.
    Nz When loading the video with fps=z, we computed num_frames=z  which exceeds total_num_frames=z. Check fps or video metadata.r   )Zdtype)r"   r#   r)   rP   rF   r[   )r^   rZ   r#   kwargsr"   	video_fpsr\   r-   r-   r.   default_sample_indices_fn   s   ra   
video_pathsample_indices_fnc                 K   s  t tdg ddl}|| }t||j}||j}|r#|| nd}tt|t	|t	|dd}|dd|i|}	d}
g }|
 r}| \}}|sMn0|
|	v rn|j\}}}|||j}||d|d|d|f  |rt|
d7 }
|
|kryn|
 sD|  |	|_t||fS )	av  
    Decode a video using the OpenCV backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    cv2r   Nopencvr"   r#   r$   r%   r^   r   r-   )r   read_video_opencvrd   ZVideoCapturer)   getZCAP_PROP_FRAME_COUNTZCAP_PROP_FPSr!   r+   ZisOpenedreadrX   ZcvtColorZCOLOR_BGR2RGBrM   releaseframes_indicesrF   rL   )rb   rc   r_   rd   r>   r"   r`   r$   r^   r\   indexframessuccessr3   heightwidthZchannelr-   r-   r.   rg      s8   
 rg   c                 K   s   t tdg ddlm}m} || |dd}| }t|}|r$|| nd}tt|t	|t	|dd}	|dd|	i|}
|
|
 }|
|	_||	fS )	a  
    Decode a video using the Decord backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    decordr   )VideoReadercpu)urictxrf   r^   Nr-   )r   read_video_decordrq   rr   rs   Zget_avg_fpslenr!   r)   r+   Z	get_batchZasnumpyrk   )rb   rc   r_   rr   rs   Zvrr`   r"   r$   r^   r\   rm   r-   r-   r.   rv   2  s   rv   c                 K   s   t tdg ddl}|| }|jjd j}|jjd j}|r#|| nd}tt	|t
|t
|dd}|dd|i|}	g }
|d |	d }t|jddD ]\}}||krY n|dkrf||	v rf|
| qOtd	d
 |
D }|	|_||fS )a}  
    Decode the video with PyAV decoder.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    avr   Npyavrf   r^   rU   r=   c                 S   s   g | ]}|j d dqS )Zrgb24)format)Z
to_ndarray)r8   xr-   r-   r.   rK     s    z#read_video_pyav.<locals>.<listcomp>r-   )r   read_video_pyavrx   openstreamsr>   rm   Zaverage_rater!   r)   r+   seek	enumeratedecoderM   rF   rL   rk   )rb   rc   r_   rx   	containerr"   r`   r$   r^   r\   rm   Z	end_indexir3   r>   r-   r-   r.   r|   ]  s,   


r|   c                 K   s   t j| ddddd\}}}|d }|d}|r|| nd}tt|t|t|dd	}	|dd
|	i|}
||
   }|
|	_||	fS )a  
    Decode the video with torchvision decoder.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
            NsecZTHWC)Z	start_ptsZend_ptsZpts_unitZoutput_formatr`   r   torchvisionrf   r^   r-   )	torchvision_ioZ
read_videosizer!   r)   r+   
contiguousnumpyrk   )rb   rc   r_   r>   _infor`   r"   r$   r^   r\   r-   r-   r.   read_video_torchvision  s(   
r   )rq   re   ry   r   ry   
VideoInputr#   backendc              	      s   durdur|du rt d|du r fdd}|}t| jdv rpt s+tdttdg dd	lm} t	 }t
|! | }	|	| g W d   n1 sSw   Y  W d   n1 sbw   Y  | }
t	|
}n4| d
sz| drt	t| j}n!tj| r| }nt| st| ttfrt| d rd}ntd| d
p| d}|r|dv rt d|du r| S t s|dkst s|dkst s|dkst s|dkrtd| d| dt| }|||fi |\} }| |fS )a  
    Loads `video` to a numpy array.

    Args:
        video (`str` or `VideoInput`):
            The video to convert to the numpy array format. Can be a link to video or local path.
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly. If not passed, the whole video is loaded.
        fps (`int`, *optional*):
            Number of frames to sample per second. Should be passed only when `num_frames=None`.
            If not specified and `num_frames==None`, all frames are sampled.
        backend (`str`, *optional*, defaults to `"pyav"`):
            The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav".
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniformt sampling with fps is performed, otherwise `sample_indices_fn` has priority over other args.
            The function expects at input the all args along with all kwargs passed to `load_video` and should output valid
            indices at which the video should be sampled. For example:

            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, Dict]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - Metadata dictionary.
    Nzc`num_frames`, `fps`, and `sample_indices_fn` are mutually exclusive arguments, please use only one!c                    s   t | f d|S )N)rZ   r#   )ra   )r^   Z	fn_kwargsr#   rZ   r-   r.   sample_indices_fn_func  s   z*load_video.<locals>.sample_indices_fn_func)zwww.youtube.comzyoutube.comzETo load a video from YouTube url you have  to install `yt_dlp` first.yt_dlpr   )	YoutubeDLzhttp://zhttps://zVIncorrect format used for video. Should be an url linking to an video or a local path.)re   r   zlIf you are trying to load a video from URL, you can decode the video only with `pyav` or `decord` as backendrq   ry   re   r   zYou chose backend=zf for loading the video but the required library is not found in your environment Make sure to install z before loading the video.)rP   r   netlocr   ImportErrorr   
load_videor   r   r   r   downloadgetvalue
startswithrequestsrh   contentospathisfiler   r0   r:   r;   	TypeErrorr   r   r   r   VIDEO_DECODERS)r>   rZ   r#   r   rc   r_   r   r   bufferfZ	bytes_objZfile_objZvideo_is_urlZvideo_decoderr^   r-   r   r.   r     sl   ' 
"r   data_formatinput_data_formatc                 C   s   t | tjstdt|  |du rt| }t| tj|d} | j	d dkr(| S | j	d dkr5| 
ddS | ddddddf dk  sG| S | ddddddf d	 }d|ddddddf  d |ddddddf | ddddddf   } | S )
a  
    Convert video to RGB by blending the transparency layer if it's in RGBA format, otherwise simply returns it.

    Args:
        video (`np.array`):
            The video to convert.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output video. If unset, will use the inferred format from the input.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input video. If unset, will use the inferred format from the input.
    zBVideo has to be a numpy array to convert to RGB format, but found N)Zinput_channel_dimrV   r/   r   .   g     o@)r0   rF   ndarrayrP   rQ   r   r   r   rW   rX   repeatany)r>   r   r   alphar-   r-   r.   convert_to_rgb,  s    Pr   r   paddingmodeconstant_valuesc           	         s    du rt   fdd}tjdtjdtjdtjdi}||}i }||vr.td| |tjkr9|||d	< tj|fd
|| i||durSt	| S S )a  
    Pads the `video` with the specified (height, width) `padding` and `mode`.

    Args:
        video (`np.ndarray`):
            The video to pad.
        padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
            Padding to apply to the edges of the height, width axes. Can be one of three formats:
            - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
            - `((before, after),)` yields same before and after pad for height and width.
            - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
        mode (`PaddingMode`):
            The padding mode to use. Can be one of:
                - `"constant"`: pads with a constant value.
                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                  vector along each axis.
                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
        constant_values (`float` or `Iterable[float]`, *optional*):
            The value to use for the padding if `mode` is `"constant"`.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output video. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_frames, num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: video in (num_frames, height, width, num_channels) format.
            If unset, will use same as the input video.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input video. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_frames, num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: video in (num_frames, height, width, num_channels) format.
            If unset, will use the inferred format of the input video.

    Returns:
        `np.ndarray`: The padded video.

    Nc                    s  t | ttfr| | f| | ff} nOt | tr,t| dkr,| d | d f| d | d ff} n3t | trCt| dkrCt | d trC| | f} nt | trXt| dkrXt | d trX| } ntd|   tjkrkddg| R ndg| dR } jdkrdg| R } | S | } | S )za
        Convert values to be in the format expected by np.pad based on the data format.
        r   r      zUnsupported format: )r   r   r@   )	r0   r)   r+   r;   rw   rP   r   rW   r2   )valuesr   r>   r-   r.   _expand_for_data_format  s   "$
$&z$pad.<locals>._expand_for_data_formatZconstantZreflectZ	replicateZ	symmetriczInvalid padding mode: r   r   )
r   r   CONSTANTZREFLECTZ	REPLICATEZ	SYMMETRICrP   rF   padr   )	r>   r   r   r   r   r   r   Zpadding_mapZ
pad_kwargsr-   r   r.   r   V  s&   +
r   c                 C   sx   i }i }t | D ]&\}}|jdd }||vrg ||< || | |t|| d f||< qdd | D }||fS )a  
    Groups videos by shape.
    Returns a dictionary with the shape as key and a list of videos with that shape as value,
    and a dictionary with the index of the video in the original list as key and the shape and index in the grouped list as value.
    rT   Nr   c                 S   s    i | ]\}}|t j|d dqS )r   )dim)torchrL   )r8   rX   rA   r-   r-   r.   
<dictcomp>  s     z)group_videos_by_shape.<locals>.<dictcomp>)r   rX   rM   rw   items)rA   Zgrouped_videosgrouped_videos_indexr   r>   rX   r-   r-   r.   group_videos_by_shape  s   r   processed_videosr   c                    s    fddt t D S )z>
    Reconstructs a list of videos in the original order.
    c                    s(   g | ]} | d    | d  qS )r   r   r-   )r8   r   r   r   r-   r.   rK     s    z"reorder_videos.<locals>.<listcomp>)rangerw   )r   r   r-   r   r.   reorder_videos  s   
r   r6   )NN)NNry   N)Mr   
contextlibr   dataclassesr   r   r   typingr   r   r   r   r	   r
   r   urllib.parser   r   rF   r   Zimage_transformsr   r   Zimage_utilsr   r   r   utilsr   r   r   r   r   r   r   r   r   r   r   Z	PIL.Imager1   ZPIL.ImageOpsr   r   r   Z
get_loggerr&   loggerr   r!   r4   r?   rB   rD   r   boolrI   rN   rR   r)   rY   r]   ra   r,   rg   rv   r|   r   r   rJ   r   r   r   r+   r   r   r   r-   r-   r-   r.   <module>   s   $4



"#
;
+
3
1


f
- 
Y6
