import enum
from collections.abc import Iterable
from typing import Dict, List, Optional, Union

from ..generation import GenerationConfig
from ..processing_utils import ProcessingKwargs, Unpack
from ..utils import (
    add_end_docstrings,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import Pipeline, build_pipeline_init_args


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_images, valid_images

if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
    from .pt_utils import KeyDataset

logger = logging.get_logger(__name__)

IMAGE_TOKEN = "<image>"


class ReturnType(enum.Enum):
    TENSORS = 0
    NEW_TEXT = 1
    FULL_TEXT = 2


class Chat:
    """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
    to this format because the rest of the pipeline code tends to assume that lists of messages are
    actually a batch of samples rather than messages in the same conversation."""

    def __init__(
        self,
        messages: Dict,
        images: Optional[Union[str, List[str], "Image.Image", List["Image.Image"]]] = None,
    ):
        for message in messages:
            if not ("role" in message and "content" in message):
                raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")

        messages = add_images_to_messages(messages, images)
        self.messages = messages


def add_images_to_messages(
    messages: dict, images: Optional[Union[str, List[str], "Image.Image", List["Image.Image"]]]
):
    """
    Retrieve and combine images from the chat and the images passed as input.
    """
    if images is None:
        images = []
    elif isinstance(images, str) or not isinstance(images, Iterable):
        images = [images]

    idx_images = 0
    for message in messages:
        for content in message["content"]:
            if not isinstance(content, dict):
                continue
            content_type = content.get("type")
            if content_type == "image":
                if not any(key in content for key in ["image", "url", "path", "base64"]):
                    if idx_images < len(images):
                        # Insert the image passed as argument in the chat message
                        content["image"] = images[idx_images]
                        idx_images += 1
                    else:
                        raise ValueError(
                            "The number of images in the chat messages should be the same as the number of images "
                            "passed to the pipeline."
                        )
            # Add support for OpenAI/TGI-style chat format
            elif content_type == "image_url":
                if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
                    # Rewrite the content to the Transformers chat format
                    content["type"] = "image"
                    content["image"] = content["image_url"]["url"]
                    del content["image_url"]
                else:
                    raise ValueError(
                        "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with "
                        "a 'url' key."
                    )

    # The number of standalone images passed should match the number of image placeholders in the chat
    if idx_images != len(images):
        raise ValueError(
            "The number of images in the chat messages should be the same as the number of images passed to the "
            "pipeline."
        )

    return messages
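

# A minimal illustration of how `add_images_to_messages` pairs standalone images with the "image"
# placeholders of a chat (the file name below is hypothetical):
#
#   messages = [
#       {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]},
#   ]
#   add_images_to_messages(messages, ["parrots.png"])
#   # -> the bare {"type": "image"} entry now carries {"image": "parrots.png"}; passing more or fewer
#   #    images than there are placeholders raises a ValueError.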


@add_end_docstrings(build_pipeline_init_args(has_processor=True))
class ImageTextToTextPipeline(Pipeline):
    """
    Image-text-to-text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text given an image and text.
    When the underlying model is a conversational model, it can also accept one or more chats,
    in which case the pipeline will operate in chat mode and will continue the chat(s) by adding its response(s).
    Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base")
    >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
    [{'generated_text': 'a photo of two birds'}]
    ```

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
    >>> messages = [
    >>>     {
    >>>         "role": "user",
    >>>         "content": [
    >>>             {
    >>>                 "type": "image",
    >>>                 "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
    >>>             },
    >>>             {"type": "text", "text": "Describe this image."},
    >>>         ],
    >>>     },
    >>>     {
    >>>         "role": "assistant",
    >>>         "content": [
    >>>             {"type": "text", "text": "There is a dog and"},
    >>>         ],
    >>>     },
    >>> ]
    >>> pipe(text=messages, max_new_tokens=20, return_full_text=False)
    [{'input_text': [{'role': 'user',
        'content': [{'type': 'image',
        'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
        {'type': 'text', 'text': 'Describe this image.'}]},
    {'role': 'assistant',
        'content': [{'type': 'text', 'text': 'There is a dog and'}]}],
    'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image-text to text pipeline can currently be loaded from pipeline() using the following task identifier:
    "image-text-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text).
    """

    _load_processor = True
    _load_image_processor = False
    _load_feature_extractor = False
    _load_tokenizer = False

    _pipeline_calls_generate = True
    # Defaults applied when the model's own generation config does not set these values (see the class docstring)
    _default_generation_config = GenerationConfig(
        max_new_tokens=256,
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)

    def _sanitize_parameters(
        self,
        max_new_tokens=None,
        generate_kwargs=None,
        timeout=None,
        return_full_text=None,
        return_tensors=None,
        return_type=None,
        clean_up_tokenization_spaces=None,
        stop_sequence=None,
        continue_final_message=None,
        **kwargs: Unpack[ProcessingKwargs],
    ):
        forward_kwargs = {}
        preprocess_params = {}
        postprocess_params = {}

        preprocess_params.update(kwargs)
        if timeout is not None:
            preprocess_params["timeout"] = timeout
        if continue_final_message is not None:
            preprocess_params["continue_final_message"] = continue_final_message

        if generate_kwargs is not None:
            forward_kwargs["generate_kwargs"] = generate_kwargs
        if max_new_tokens is not None:
            if "generate_kwargs" not in forward_kwargs:
                forward_kwargs["generate_kwargs"] = {}
            if "max_new_tokens" in forward_kwargs["generate_kwargs"]:
                raise ValueError(
                    "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter,"
                    " please use only one"
                )
            forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens

        if return_full_text is not None and return_type is None:
            if return_tensors is not None:
                raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
            return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
        if return_tensors is not None and return_type is None:
            return_type = ReturnType.TENSORS
        if return_type is not None:
            postprocess_params["return_type"] = return_type
        if continue_final_message is not None:
            postprocess_params["continue_final_message"] = continue_final_message
        if clean_up_tokenization_spaces is not None:
            postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
        if stop_sequence is not None:
            stop_sequence_ids = self.processor.tokenizer.encode(stop_sequence, add_special_tokens=False)
            if len(stop_sequence_ids) > 1:
                logger.warning_once(
                    "Stopping on a multiple token sequence is not yet supported on transformers. The first token of"
                    " the stop sequence will be used as the stop sequence string in the interim."
                )
            generate_kwargs["eos_token_id"] = stop_sequence_ids[0]

        return preprocess_params, forward_kwargs, postprocess_params

    def __call__(
        self,
        images: Optional[
            Union[str, List[str], List[List[str]], "Image.Image", List["Image.Image"], List[List["Image.Image"]]]
        ] = None,
        text: Optional[Union[str, List[str], List[dict]]] = None,
        **kwargs,
    ):
        """
        Generate a text given text and the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image`, `List[PIL.Image]`, `List[Dict[str, Union[str, PIL.Image]]]`):
                The pipeline handles three types of images:

                - A string containing an HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images. Finally, this pipeline also supports
                the chat format (see `text`) containing images and text in this argument.
            text (str, List[str], `List[Dict[str, Union[str, PIL.Image]]]`):
                The text to be used for generation. If a list of strings is passed, the length of the list should be
                the same as the number of images. Text can also follow the chat format: a list of dictionaries where
                each dictionary represents a message in a conversation. Each dictionary should have two keys: 'role'
                and 'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of
                dictionaries containing the text of the message and the type of the message. The type of the message
                can be either 'text' or 'image'. If the type is 'image', no text is needed.
            return_tensors (`bool`, *optional*, defaults to `False`):
                Returns the tensors of predictions (as token indices) in the outputs. If set to
                `True`, the decoded text is not returned.
            return_text (`bool`, *optional*):
                Returns the decoded texts in the outputs.
            return_full_text (`bool`, *optional*, defaults to `True`):
                If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
                specified at the same time as `return_text`.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                Whether or not to clean up the potential extra spaces in the text output.
            continue_final_message (`bool`, *optional*): This indicates that you want the model to continue the
                last message in the input chat rather than starting a new one, allowing you to "prefill" its response.
                By default this is `True` when the final message in the input chat has the `assistant` role and
                `False` otherwise, but you can manually override that behaviour by setting this flag.

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following key (cannot
            return a combination of both `generated_text` and `generated_token_ids`):

            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
            - **generated_token_ids** (`torch.Tensor`, present when `return_tensors=True`) -- The token
                ids of the generated text.
            - **input_text** (`str`) -- The input text.
        """
        if images is None and text is None:
            raise ValueError("You must at least provide either text or images.")

        def _is_chat(arg):
            # One or more conversations: a sequence whose items are message dicts (or lists of message dicts)
            return isinstance(arg, (list, tuple, KeyDataset)) and isinstance(arg[0], (list, tuple, dict))

        if _is_chat(text):
            # Chat(s) passed through `text`: pair each conversation with the corresponding image(s)
            if isinstance(text[0], dict):
                return super().__call__(Chat(text, images), **kwargs)
            else:
                if images is None:
                    images = [None] * len(text)
                chats = [Chat(chat, image) for chat, image in zip(text, images)]
                return super().__call__(chats, **kwargs)

        if text is None and _is_chat(images):
            # Chat(s) passed positionally as the first argument, with the images already inside the messages
            if isinstance(images[0], dict):
                return super().__call__(Chat(images), **kwargs)
            else:
                chats = [Chat(image) for image in images]
                return super().__call__(chats, **kwargs)

        if images is not None and text is None and not valid_images(images):
            # Supports the formats used by other multimodal pipelines:
            # - {"image": image, "text": text}
            # - [{"image": image, "text": text}]
            # - generators and datasets
            return super().__call__(images, **kwargs)

        # encourage the user to use the chat format if the model supports it
        if getattr(self.processor, "chat_template", None) is not None:
            logger.warning_once(
                "The input data was not formatted as a chat with dicts containing 'role' and 'content' keys, even "
                "though this model supports chat. Consider using the chat format for better results. For more "
                "information, see https://huggingface.co/docs/transformers/en/chat_templating"
            )

        # support text-only generation
        if images is None:
            return super().__call__(text, **kwargs)
        if text is None:
            raise ValueError("You must provide text for this pipeline.")

        return super().__call__({"images": images, "text": text}, **kwargs)

    def preprocess(self, inputs=None, timeout=None, continue_final_message=None, **processing_kwargs):
        if isinstance(inputs, Chat):
            # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
            # because very few models support multiple separate, consecutive assistant messages
            if continue_final_message is None:
                continue_final_message = inputs.messages[-1]["role"] == "assistant"
            model_inputs = self.processor.apply_chat_template(
                inputs.messages,
                add_generation_prompt=not continue_final_message,
                continue_final_message=continue_final_message,
                return_tensors=self.framework,
                tokenize=True,
                return_dict=True,
            )
            model_inputs["text"] = inputs
            return model_inputs

        # In case we only have text inputs
        if isinstance(inputs, (list, tuple, str)):
            images = None
            text = inputs
            inputs_text = inputs
        else:
            images = load_images(inputs["images"], timeout=timeout)
            text = inputs["text"]
            inputs_text = inputs["text"]

        # if batched text inputs, we set padding to True unless specified otherwise
        if isinstance(text, (list, tuple)) and len(text) > 1:
            processing_kwargs.setdefault("padding", True)

        model_inputs = self.processor(
            images=images, text=text, return_tensors=self.framework, **processing_kwargs
        ).to(dtype=self.torch_dtype)

        model_inputs["text"] = inputs_text

        return model_inputs

    def _forward(self, model_inputs, generate_kwargs=None):
        generate_kwargs = {} if generate_kwargs is None else generate_kwargs
        prompt_text = model_inputs.pop("text")
        input_ids = (
            model_inputs["input_ids"] if "input_ids" in model_inputs else model_inputs["decoder_input_ids"]
        )  # encoder-decoder models store the prompt in `decoder_input_ids`
        generated_sequence = self.model.generate(**model_inputs, **generate_kwargs)

        return {"generated_sequence": generated_sequence, "prompt_text": prompt_text, "input_ids": input_ids}

    def postprocess(
        self,
        model_outputs,
        return_type=ReturnType.FULL_TEXT,
        continue_final_message=None,
        **postprocess_kwargs,
    ):
        input_texts = model_outputs["prompt_text"]
        input_texts = [input_texts] if isinstance(input_texts, (str, Chat)) else input_texts
        generated_sequence = model_outputs["generated_sequence"]
        input_ids = model_outputs["input_ids"]
        if return_type == ReturnType.TENSORS:
            return [
                {"input_text": input_texts[i], "generated_token_ids": generated_sequence[i]}
                for i in range(len(input_texts))
            ]

        # Decode inputs and outputs the same way so the input text can be removed from the generated text if present
        generated_texts = self.processor.post_process_image_text_to_text(generated_sequence, **postprocess_kwargs)
        decoded_inputs = self.processor.post_process_image_text_to_text(input_ids, **postprocess_kwargs)

        # Force consistent behavior for including the input text in the output
        if return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
            new_generated_texts = []
            for text_generated, decoded_input in zip(generated_texts, decoded_inputs):
                # There can be added characters before the input text, so we look for where the decoded input
                # starts inside the generated text
                index_input_text = text_generated.find(decoded_input)
                # Only strip the prompt when it sits within the first few characters, to avoid removing a large
                # part of the generated text
                if 0 <= index_input_text <= 2:
                    new_generated_texts.append(text_generated[index_input_text + len(decoded_input) :])
                else:
                    new_generated_texts.append(text_generated)
            generated_texts = new_generated_texts
        if return_type == ReturnType.FULL_TEXT:
            full_texts = []
            for prompt_text, generated_text in zip(input_texts, generated_texts):
                if isinstance(prompt_text, str):
                    generated_text = prompt_text + generated_text
                elif isinstance(prompt_text, Chat):
                    if continue_final_message is None:
                        # If the chat ends in an assistant message, we treat it as a prefill by default because
                        # very few models support multiple separate, consecutive assistant messages
                        continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
                    if continue_final_message:
                        # With assistant prefill, concatenate the generation onto the end of the last message
                        new_text = dict(prompt_text.messages[-1]["content"][-1].items())
                        new_text["text"] += generated_text
                        generated_text = list(prompt_text.messages)[:-1] + [
                            {
                                "role": prompt_text.messages[-1]["role"],
                                "content": prompt_text.messages[-1]["content"][:-1] + [new_text],
                            }
                        ]
                    else:
                        # When we're not starting from a prefill, the output is a new assistant message
                        generated_text = list(prompt_text.messages) + [
                            {"role": "assistant", "content": generated_text}
                        ]
                full_texts.append(generated_text)
            generated_texts = full_texts

        records = [
            {
                "input_text": input_text.messages if isinstance(input_text, Chat) else input_text,
                "generated_text": generated_text,
            }
            for input_text, generated_text in zip(input_texts, generated_texts)
        ]

        return records
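

# A short usage sketch of the "prefill" behaviour documented in `__call__`: when the last message in the
# input chat has the "assistant" role, the pipeline continues that message instead of opening a new one.
# The checkpoint and image URL are the ones used in the class docstring; any image-text-to-text model with
# a chat template should behave the same way.
#
#   from transformers import pipeline
#
#   pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
#   messages = [
#       {
#           "role": "user",
#           "content": [
#               {"type": "image", "url": "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"},
#               {"type": "text", "text": "Describe this image."},
#           ],
#       },
#       {"role": "assistant", "content": [{"type": "text", "text": "There are"}]},
#   ]
#   # continue_final_message defaults to True here because the chat ends with an assistant message;
#   # return_full_text=False returns only the newly generated continuation.
#   pipe(text=messages, max_new_tokens=20, return_full_text=False)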