o
    Zh+                     @   s   d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZ ddlmZmZ dd	lmZ d
dlmZ eeZG dd de	ZdgZdS )zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
    N)ListOptionalUnion   )BatchFeature)ProcessorMixin)
AddedTokenBatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging)
VideoInput   )AutoTokenizerc                $       s$  e Zd ZdZg dZdgZdZdZdZd) fdd	Z						
						
	
	
	
	
			d*de
deeeee ee f dedeeeef deeeef dee dedee dee dededededededeeeef  def"ddZdd  Zd!d" Zed#d$ Z fd%d&Ze fd'd(Z  ZS )+InstructBlipVideoProcessora  
    Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.

    Args:
        video_processor (`InstructBlipVideoVideoProcessor`):
            An instance of [`InstructBlipVideoVideoProcessor`]. The video processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    )video_processor	tokenizerqformer_tokenizernum_query_tokensZAutoVideoProcessorr   Nc                    sP   t |dstdddd| _|j| jgdd n|j| _|| _t ||| d S )Nvideo_tokenz<video>FT)
normalizedZspecial)Zspecial_tokens)hasattrr   r   Z
add_tokensr   super__init__)selfr   r   r   r   kwargs	__class__ q/var/www/auris/lib/python3.10/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.pyr   A   s   
z#InstructBlipVideoProcessor.__init__TFr   imagestextadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbosereturn_tensorsreturnc                 K   s  |du r|du rt dt }|durt|tr|g}nt|ts,t|d ts,t d| jd||||||||	|
|||||dd|}| jdur{|dur{i }| jj| j d }| j|gt	| ddd}|D ]}d	d
 t
|| || D ||< qgn|}|durtd t||d}|| | jd||||||||	|
||||||d|}|d|d< |d|d< |dur| j||d}|| |S )a%  
        This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Nz3You have to specify at least one of images or text.r   zAInvalid input text. Please provide a string, or a list of strings)r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2      F)r%   r2   c                 S   s   g | ]\}}|| qS r!   r!   ).0Zimg_encodingZtxt_encodingr!   r!   r"   
<listcomp>   s    z7InstructBlipVideoProcessor.__call__.<locals>.<listcomp>aK  Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.)Ztensor_typeZ	input_idsZqformer_input_idsZattention_maskZqformer_attention_mask)r2   r!   )
ValueErrorr   
isinstancestrlistr   r   r   contentlenziploggerZwarning_oncer	   updater   popr   )r   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r   encodingZ_text_encodingtext_encodingZvideo_tokensZvideo_token_encodingkZqformer_text_encodingZimage_encodingr!   r!   r"   __call__J   s   


z#InstructBlipVideoProcessor.__call__c                 O      | j j|i |S )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r   batch_decoder   argsr   r!   r!   r"   rF         z'InstructBlipVideoProcessor.batch_decodec                 O   rE   )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   decoderG   r!   r!   r"   rJ      rI   z!InstructBlipVideoProcessor.decodec                 C   s"   | j j}| jj}tt|| S N)r   model_input_namesZimage_processorr:   dictfromkeys)r   Ztokenizer_input_namesZimage_processor_input_namesr!   r!   r"   rL      s   z,InstructBlipVideoProcessor.model_input_namesc                    s   t j|rtd| dt j|dd t j|d}| j| d| jv }|r/| j	d t
 j|fi |}|rC|  jdg7  _|S )NzProvided path (z#) should be a directory, not a fileT)exist_okr   )ospathisfiler7   makedirsjoinr   save_pretrained
attributesremover   )r   Zsave_directoryr   Zqformer_tokenizer_pathZqformer_presentZoutputsr   r!   r"   rU      s   
z*InstructBlipVideoProcessor.save_pretrainedc                    s>   t  j|fi |}t|tr|d }tj|dd}||_|S )Nr   r   )Z	subfolder)r   from_pretrainedr8   tupler   r   )clsZpretrained_model_name_or_pathr   	processorr   r   r!   r"   rX      s   
z*InstructBlipVideoProcessor.from_pretrainedrK   )NNTFNNr   NNFFFFFTN) __name__
__module____qualname____doc__rV   Zvalid_kwargsZvideo_processor_classZtokenizer_classZqformer_tokenizer_classr   r   r   r   r   r   boolr9   r
   r   r   intr   r   rD   rF   rJ   propertyrL   rU   classmethodrX   __classcell__r!   r!   r   r"   r   (   s    	

o
r   )r_   rP   typingr   r   r   Zimage_processing_utilsr   Zprocessing_utilsr   Ztokenization_utils_baser   r	   r
   r   r   r   utilsr   r   Zvideo_utilsr   autor   Z
get_loggerr\   r>   r   __all__r!   r!   r!   r"   <module>   s    
 
H