from typing import List, Union

from ..generation import GenerationConfig
from ..utils import (
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import Pipeline, build_pipeline_init_args


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
class ImageToTextPipeline(Pipeline):
    """
    Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image to text pipeline can currently be loaded from pipeline() using the following task identifier:
    "image-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    """

    _pipeline_calls_generate = True
    # Make sure the class docstring above stays in sync when this default generation config is changed
    _default_generation_config = GenerationConfig(
        max_new_tokens=256,
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        self.check_model_type(
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
        )

    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
        forward_params = {}
        preprocess_params = {}

        if prompt is not None:
            preprocess_params["prompt"] = prompt
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if max_new_tokens is not None:
            forward_params["max_new_tokens"] = max_new_tokens
        if generate_kwargs is not None:
            if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
                raise ValueError(
                    "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
                    " only 1 version"
                )
            forward_params.update(generate_kwargs)

        if self.assistant_model is not None:
            forward_params["assistant_model"] = self.assistant_model
        if self.assistant_tokenizer is not None:
            forward_params["tokenizer"] = self.tokenizer
            forward_params["assistant_tokenizer"] = self.assistant_tokenizer

        return preprocess_params, forward_params, {}

    def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
        """
        Assign labels to the image(s) passed as inputs.

        Args:
            inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing an HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The maximum number of tokens to generate. By default, it will use `generate`'s default.

            generate_kwargs (`Dict`, *optional*):
                Arguments passed directly to `generate`, allowing full control of the generation call.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of lists of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
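
        Example (a minimal usage sketch; the checkpoint is the one from the class docstring, and the generation
        arguments are illustrative only):

        ```python
        >>> from transformers import pipeline

        >>> captioner = pipeline("image-to-text", model="ydshieh/vit-gpt2-coco-en")
        >>> captioner(
        ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
        ...     generate_kwargs={"num_beams": 3},
        ...     timeout=10.0,
        ... )  # doctest: +SKIP
        ```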
        """
        if "images" in kwargs:
            inputs = kwargs.pop("images")
        if inputs is None:
            raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!")
        return super().__call__(inputs, **kwargs)

    def preprocess(self, image, prompt=None, timeout=None):
        image = load_image(image, timeout=timeout)

        if prompt is not None:
            logger.warning_once(
                "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48"
                " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead",
            )
            if not isinstance(prompt, str):
                raise ValueError(
                    f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                    "Note also that one single text can be provided for conditional image to text generation."
                )

            model_type = self.model.config.model_type

            if model_type == "git":
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)
                # GIT expects the CLS token prepended to the prompt ids
                input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                input_ids = [self.tokenizer.cls_token_id] + input_ids
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                model_inputs.update({"input_ids": input_ids})

            elif model_type == "pix2struct":
                model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)

            elif model_type != "vision-encoder-decoder":
                # vision-encoder-decoder does not support conditional generation
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)
                text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                model_inputs.update(text_inputs)

            else:
                raise ValueError(f"Model type {model_type} does not support conditional text generation")

        else:
            model_inputs = self.image_processor(images=image, return_tensors=self.framework)
            if self.framework == "pt":
                model_inputs = model_inputs.to(self.torch_dtype)

        if self.model.config.model_type == "git" and prompt is None:
            model_inputs["input_ids"] = None

        return model_inputs

    def _forward(self, model_inputs, **generate_kwargs):
        # The GIT model sets `model_inputs["input_ids"] = None` in `preprocess` (when `prompt=None`). In batch mode,
        # the pipeline will group them into a list of `None`, which fails `_forward`. Avoid this by checking it first.
        if (
            "input_ids" in model_inputs
            and isinstance(model_inputs["input_ids"], list)
            and all(x is None for x in model_inputs["input_ids"])
        ):
            model_inputs["input_ids"] = None

        # A user-defined `generation_config` passed in `generate_kwargs` takes precedence over the pipeline default
        if "generation_config" not in generate_kwargs:
            generate_kwargs["generation_config"] = self.generation_config

        # FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py` parse
        # inputs. In the TensorFlow version, `generate` raises an error if we don't use `input_ids`, whereas the
        # PyTorch version matches it against `self.model.main_input_name` in its `_prepare_model_inputs` method.
        inputs = model_inputs.pop(self.model.main_input_name)
        model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
        return model_outputs

    def postprocess(self, model_outputs):
        records = []
        for output_ids in model_outputs:
            record = {
                "generated_text": self.tokenizer.decode(
                    output_ids,
                    skip_special_tokens=True,
                )
            }
            records.append(record)
        return records
__module____qualname____doc__Z_pipeline_calls_generater   Z_default_generation_configr   r)   r   r5   r   r-   r>   rH   rM   __classcell__r   r   r   r    r   .   s    
(
$3r   )typingr   r   Z
generationr   utilsr   r   r   r	   r
   r   baser   r   ZPILr   Zimage_utilsr   Zmodels.auto.modeling_tf_autor   r;   Zmodels.auto.modeling_autor   Z
get_loggerrN   r3   r   r   r   r   r    <module>   s    
