"""
Processor class for InstructBLIP. Largely a copy of Blip2Processor with the addition of a tokenizer for the Q-Former.
    N)ListUnion   )BatchFeature)
ImageInput)ProcessingKwargsProcessorMixinUnpack)
AddedTokenBatchEncodingPreTokenizedInput	TextInput)logging   )AutoTokenizerc                
   @   s*   e Zd Zdddddddddd	i dZdS )InstructBlipProcessorKwargsTFr   )	add_special_tokenspaddingZstrideZreturn_overflowing_tokensZreturn_special_tokens_maskZreturn_offsets_mappingZreturn_token_type_idsZreturn_lengthverbose)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaults r   r   g/var/www/auris/lib/python3.10/site-packages/transformers/models/instructblip/processing_instructblip.pyr   &   s    
r   F)totalc                
       s   e Zd ZdZg dZdgZdZdZdZd fdd	Z					dd	e
d
eeeee ee f dee defddZdd Zdd Zedd Z fddZe fddZ  ZS )InstructBlipProcessora  
    Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.

    Args:
        image_processor (`BlipImageProcessor`):
            An instance of [`BlipImageProcessor`]. The image processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of [`PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
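
    Example (a minimal usage sketch; the checkpoint name and image URL are illustrative and not taken
    from this file):

    ```python
    >>> from transformers import InstructBlipProcessor
    >>> from PIL import Image
    >>> import requests

    >>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> # the prompt is tokenized twice: once for the language model, once for the Q-Former
    >>> inputs = processor(images=image, text="What is unusual about this image?", return_tensors="pt")
    >>> sorted(inputs.keys())
    ['attention_mask', 'input_ids', 'pixel_values', 'qformer_attention_mask', 'qformer_input_ids']
    ```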
    """

    attributes = ["image_processor", "tokenizer", "qformer_tokenizer"]
    valid_kwargs = ["num_query_tokens"]
    image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast")
    tokenizer_class = "AutoTokenizer"
    qformer_tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
        # make sure the language-model tokenizer has an `<image>` placeholder token available
        if not hasattr(tokenizer, "image_token"):
            self.image_token = AddedToken("<image>", normalized=False, special=True)
            tokenizer.add_tokens([self.image_token], special_tokens=True)
        else:
            self.image_token = tokenizer.image_token
        self.num_query_tokens = num_query_tokens
        super().__init__(image_processor, tokenizer, qformer_tokenizer)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[InstructBlipProcessorKwargs],
    ) -> BatchFeature:
        """
        This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Args:
            images (`ImageInput`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
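
        Example (an illustrative sketch; the checkpoint and image URL are assumptions, not taken from
        this file):

        ```python
        >>> from transformers import InstructBlipProcessor
        >>> from PIL import Image
        >>> import requests

        >>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> out = processor(images=image, text="Describe the image.", return_tensors="pt")

        >>> # when `num_query_tokens` is set, `input_ids` is prefixed with that many `<image>`
        >>> # placeholders, while `qformer_input_ids` holds the plain Q-Former tokenization
        >>> lm_len, qformer_len = out["input_ids"].shape[-1], out["qformer_input_ids"].shape[-1]
        ```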
        """
        if images is None and text is None:
            raise ValueError("You have to specify at least images or text.")

        output_kwargs = self._merge_kwargs(
            InstructBlipProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        encoding = BatchFeature()

        if text is not None:
            if isinstance(text, str):
                text = [text]
            elif not isinstance(text, list) and not isinstance(text[0], str):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")

            # pop `return_tensors` so both tokenizations below return plain lists that can be concatenated
            return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
            _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
            output_kwargs["text_kwargs"]["return_tensors"] = return_tensors

            # if we know how many query tokens the model uses, expand each prompt with that many
            # `<image>` placeholder tokens, prepended before the BOS token as InstructBLIP expects
            if self.num_query_tokens is not None and images is not None:
                text_encoding = {}
                image_tokens = self.image_token.content * self.num_query_tokens
                image_token_encoding = self.tokenizer(
                    [image_tokens] * len(text), add_special_tokens=False, return_tensors=None
                )
                for k in _text_encoding:
                    text_encoding[k] = [
                        img_encoding + txt_encoding
                        for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
                    ]
            else:
                text_encoding = _text_encoding
                if images is not None:
                    logger.warning_once(
                        "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
                        "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) "
                        "to update your InstructBLIP model. Using processors without these attributes in the config "
                        "is deprecated and will throw an error in v4.50."
                    )

            # cast to the desired tensor type only after the (optional) concatenation above
            text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
            encoding.update(text_encoding)

            qformer_text_encoding = self.qformer_tokenizer(text, **output_kwargs["text_kwargs"])
            encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
            encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask")

        if images is not None:
            image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"])
            encoding.update(image_encoding)

        return encoding

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # preserve order while dropping duplicates shared by the two components
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    def save_pretrained(self, save_directory, **kwargs):
        if os.path.isfile(save_directory):
            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
        os.makedirs(save_directory, exist_ok=True)
        qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer")
        self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path)

        # temporarily drop the Q-Former tokenizer from `attributes` so the base class does not
        # save it a second time in the top-level directory
        qformer_present = "qformer_tokenizer" in self.attributes
        if qformer_present:
            self.attributes.remove("qformer_tokenizer")

        outputs = super().save_pretrained(save_directory, **kwargs)

        if qformer_present:
            self.attributes += ["qformer_tokenizer"]
        return outputs

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)

        # when `return_unused_kwargs` is passed, the base class returns a (processor, unused_kwargs) tuple
        if isinstance(processor, tuple):
            processor = processor[0]
        qformer_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, subfolder="qformer_tokenizer"
        )
        processor.qformer_tokenizer = qformer_tokenizer
        return processor


__all__ = ["InstructBlipProcessor"]