o
    Zh|                  
   @   s  d Z ddlZddlZddlZddlmZmZmZmZ ddl	m
Z
 ddlmZmZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZ eeeeef  eeeeeef  eeeeef   eeeeeef   f ZG d
d deddZG dd deddZG dd deddZG dd deZdeeeeef dedeeef fddZ dededefddZ!dd Z"dd  Z#d!d" Z$d&d$d%Z%dgZ&dS )'zProcessor class for KOSMOS-2.    N)ListOptionalTupleUnion   )BatchFeature)
ImageInput
is_batched)ImagesKwargsProcessingKwargsProcessorMixin
TextKwargsUnpack)
AddedToken)BatchEncoding	TextInputc                   @   s6   e Zd ZU eee  ed< ee ed< ee ed< dS )Kosmos2ImagesKwargsbboxesnum_image_tokensfirst_image_token_idN)__name__
__module____qualname__r   r   float__annotations__int r   r   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/kosmos2/processing_kosmos2.pyr   %   s   
 r   F)totalc                   @   s   e Zd ZU ee ed< dS )Kosmos2TextKwargsadd_eos_tokenN)r   r   r   r   boolr   r   r   r   r   r   +   s   
 r   c                
   @   s@   e Zd ZU eed< eed< dddddddddd	ddid	Zd
S )Kosmos2ProcessorKwargstext_kwargsimages_kwargsTFr   )	add_special_tokenspaddingZstrideZreturn_overflowing_tokensZreturn_special_tokens_maskZreturn_offsets_mappingreturn_token_type_idsverboser    r   @   )r#   r$   N)r   r   r   r   r   r   	_defaultsr   r   r   r   r"   /   s    
 
r"   c                       sR  e Zd ZdZddgZdgZdZdZd+ fdd		Z	
	
	
	
d,de	de
eee f dee defddZdd Zdd Z	
	
	d-de
eee f de	dedee de
eee f f
ddZdd Zdd Zd.d d!Zd.d"d#Zed$d% Zdede
eee  eee  f defd&d'Z d(e
eeef eeeeef f deeef fd)d*Z!  Z"S )/Kosmos2Processora,  
    Constructs an KOSMOS-2 processor which wraps a KOSMOS-2 image processor and a KOSMOS-2 tokenizer into a single
    processor.

    [`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and some functionalities of
    [`XLMRobertaTokenizerFast`]. See the docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`]
    for more information.

    Args:
        image_processor (`CLIPImageProcessor`):
            An instance of [`CLIPImageProcessor`]. The image processor is a required input.
        tokenizer (`XLMRobertaTokenizerFast`):
            An instance of ['XLMRobertaTokenizerFast`]. The tokenizer is a required input.
        num_patch_index_tokens (`int`, *optional*, defaults to 1024):
            The number of tokens that represent patch indices.
    image_processor	tokenizernum_patch_index_tokens)ZCLIPImageProcessorZCLIPImageProcessorFastZAutoTokenizer   c                    s   d|_ d| _d| _d| _d| _d| _d| _d| _d	| _d
| _	d| _
d| _| j| j| j| j| j| j| j| j| j	| j
| jg| _|| _dd t| jD }g }| j| D ]}|t|dddd qQ|| t || d S )NFz</doc>z<image>z</image>z</chunk>z</line>z<phrase>z	</phrase>z<object>z	</object></delimiter_of_multi_objects/>z<grounding>c                 S   s"   g | ]}d t |d dqS )<patch_index_   >)strzfill.0xr   r   r   
<listcomp>   s   " z-Kosmos2Processor.__init__.<locals>.<listcomp>T)lstriprstrip
normalized)r'   Z	eod_token	boi_token	eoi_tokenZ	eoc_tokenZ	eol_tokenZ	bop_tokenZ	eop_tokenZ	boo_tokenZ	eoo_tokenZ	dom_tokenZ	grd_tokenZ
tag_tokensr.   rangeappendr   Z
add_tokenssuper__init__)selfr,   r-   r.   kwargsZpatch_index_tokensZtokens_to_addtoken	__class__r   r   rB   [   s>   
zKosmos2Processor.__init__NimagestextrD   returnc                    sJ  |du r|du rt djtfdjji|}|d dd}|d dd}|d dd}	|d	 d
d}
|d	 d }|d	 d }|d	 dd}t }|durdj|fi |d }|	| |durƈj
||||d}|r|
st|trjj | }nt|trfdd|D }|d	 d o|
|d	 d< |du r|nd|d	 d< |du r|nd|d	 d< jd&d|i|d	 }|	| ||d	 d< ||d	 d< ||d	 d< |dur#|dur#|	du rjjd }	|}t|d }tt|	|	| }dgdg|  dg }g }g }|d }t|tr |g}|d g|d< |D ]7}|d| | ||| d  }|| t|}|rFdg| }|dgt|t|  7 }|| q"t|trtdd t|jD dd d}|d \}}|d \}}|d	 d o|
|d	 d< d|d	 d< jd&d|| gi|d	 }t|jd  | krjjdkr҇ fdd|D } fdd|D } fdd|d D |d< n'jjd kr fd!d|D } fd"d|D } fd#d|d D |d< t|tr|du r|d }|d d |d< |d }|	t||d |d$|d% |S )'a	  
        This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`XLMRobertaTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.

        The rest of this documentation shows the arguments specific to `Kosmos2Processor`.

        Args:
            bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional* defaults to 64):
                The number of (consecutive) places that are used to mark the placeholders to store image information.
                This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
            first_image_token_id (`int`, *optional*):
                The token id that will be used for the first place of the subsequence that is reserved to store image
                information. If unset, will default to `self.tokenizer.unk_token_id + 1`.
            add_eos_token (`bool`, defaults to `False`):
                Whether or not to include `EOS` token id in the encoding when `add_special_tokens=True`.
        Nz*You have to specify either images or text.Ztokenizer_init_kwargsr$   r   r   r)   r   r#   r    Fr%   r&   return_tensors)r   c                    s   g | ]
} j j | qS r   )r-   	bos_token)r7   srC   r   r   r9          z-Kosmos2Processor.__call__.<locals>.<listcomp>rI      r   	input_idsattention_maskc                 S   s   g | ]
\}}|t |fqS r   len)r7   idxr8   r   r   r   r9      rO   c                 S   s   | d S Nr   )r8   r   r   r   <lambda>   s    z+Kosmos2Processor.__call__.<locals>.<lambda>)keyrW   rightc                    s&   g | ]}|j jg t|   qS r   r-   Zpad_token_idrT   r6   max_len_paddedrC   r   r   r9        & c                    "   g | ]}|d g t |   qS r   rS   r6   r]   r   r   r9         c                    r_   r`   rS   r6   ra   r   r   r9   
  rb   leftc                    s&   g | ]}j jg t|  | qS r   r[   r6   r\   r   r   r9     r^   c                    "   g | ]}d g t |  | qS r`   rS   r6   ra   r   r   r9     rb   c                    rd   r`   rS   r6   ra   r   r   r9     rb   )rQ   rR   image_embeds_position_mask)dataZtensor_typer   )
ValueErrorZ_merge_kwargsr"   r-   Zinit_kwargspop
setdefaultr   r,   updatepreprocess_examples
isinstancer4   rL   listZunk_token_idr   r?   r@   copyrT   sorted	enumeraterQ   Zpadding_sider   )rC   rH   rI   ZaudioZvideosrD   Zoutput_kwargsr   r   r   r    r%   r&   rK   encodingZimage_encodingtext_encodingZwith_bosstart_indexZimage_token_idsZbase_image_embeds_position_maskrQ   re   Zall_input_idsZtext_idsmaskZsorted_length_Zmin_len_not_paddedrU   r   r\   r   __call__   s   




 









zKosmos2Processor.__call__c                 C   s   |du rdS t |tstd|D ];}|du rqt |ts |g}|D ])}t |trGt|dkr8tdd |D sKt|dkrGtdd |D sKtdq"qdS )	a  
        Check `bboxes` for a single text example. It could be
            - `None`: no bounding box associated to a text.
            - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
              in a text. This could be:
                  - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
                  - A tuple of 2 integers: A single bounding box specified by patch indices.
                  - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
                  - A list containing the above 2 tuple types: Multiple bounding boxes for a
                   `<phrase> ... </phrase>` pair.
        Nz@`bboxes` (for a single text example) should be `None` or a list.   c                 s       | ]}t |tV  qd S N)rl   r   r6   r   r   r   	<genexpr>C      zAKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>r2   c                 s   rx   ry   )rl   r   r6   r   r   r   rz   D  r{   a'  Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing 2 integers or 4 float point numbers, or a list containing such tuples. Also make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in batches or both for a single example.)rl   rm   rg   tuplerT   all)rC   r   bboxelementr   r   r   _check_bboxes_for_single_text*  s&   


z.Kosmos2Processor._check_bboxes_for_single_textc                 C   s.   |  }|d ur| d| }| ||}|S )N )strip_insert_patch_index_tokens)rC   rI   imager   img_info_tokensr   r   r   _preprocess_single_exampleM  s
   z+Kosmos2Processor._preprocess_single_exampler)   textsr   r   c           	         s@  j g| }dj g| jg  d}t|trd}|g}|du r+dgt| }nt|s2|g}t|t|krItdt| dt| d|sT| |g}n|durlt|t	satd|D ]}| qcndgt| }t|t|krtd	t| dt| d fd
dt
|||D }|s|d }|S )a-  Add image and bounding box information to `texts` as image and patch index tokens.

        Args:
            texts (`Union[TextInput, List[TextInput]]`): The texts to be processed.
            images (`ImageInput`, *optional*): The images associated to `texts`.
            bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional*, defaults to 64):
                The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
                attribute in `Kosmos2Config`.

        Returns:
            `Union[TextInput, List[TextInput]]`: The processed texts with image and patch index tokens.
        r   TFNzGThe number of examples in `texts` and `images` should be the same. Got  v.s. 	 instead.zS`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.zGThe number of examples in `texts` and `bboxes` should be the same. Got c                    s"   g | ]\}}} ||| qS r   )r   )r7   rI   r   r~   r   rC   r   r   r9     s    z8Kosmos2Processor.preprocess_examples.<locals>.<listcomp>r   )r=   joinr>   rl   r4   rT   r	   rg   r   rm   zip)	rC   r   rH   r   r   Z
img_tokensZbatchedr8   resultr   r   r   rk   W  sD   



z$Kosmos2Processor.preprocess_examplesc                 O      | j j|i |S )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r-   batch_decoderC   argsrD   r   r   r   r        zKosmos2Processor.batch_decodec                 O   r   )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r-   decoder   r   r   r   r     r   zKosmos2Processor.decodeTc                 C   s    | | jd }|rt|S |S rV   )splitr>   +clean_text_and_extract_entities_with_bboxes)rC   rI   cleanup_and_extractcaptionr   r   r   post_process_generation  s   z(Kosmos2Processor.post_process_generationc                    s(    j |fd|i|} fdd|D S )a  
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `List[str]`: The decoded text.
        skip_special_tokensc                    s   g | ]	} j |d dqS )F)r   )r   )r7   rI   rN   r   r   r9     s    zDKosmos2Processor.post_process_image_text_to_text.<locals>.<listcomp>)r   )rC   Zgenerated_outputsr   rD   Zgenerated_textsr   rN   r   post_process_image_text_to_text  s   z0Kosmos2Processor.post_process_image_text_to_textc                 C   s"   | j j}| jj}tt|| S ry   )r-   model_input_namesr,   rm   dictfromkeys)rC   Ztokenizer_input_namesZimage_processor_input_namesr   r   r   r     s   z"Kosmos2Processor.model_input_namesc                 C   sP  |d u s
t |dkr|S ttjd|d}t |t |kr,tdt | dt | dd}g }t||D ]\\}}| \}}	||||	  |	}|d u rOq5t|t	rW|g}g }
t
dd |D sftd	|D ]}| |\}}|
| d
|  qht |
dkrq5d|
}|d| d q5|t |k r|||d   d|}|S )Nr   z<phrase>.+?</phrase>)stringzuThe number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got r   r   c                 s   s    | ]}|d uV  qd S ry   r   )r7   boxr   r   r   rz     s    z>Kosmos2Processor._insert_patch_index_tokens.<locals>.<genexpr>zTThe multiple bounding boxes for a single phrase should not contain any `None` value.r   z  </delimiter_of_multi_objects/> z	<object> z
 </object> )rT   rm   refinditerrg   r   spanr@   rl   r|   r}   #_convert_bbox_to_patch_index_tokensr   )rC   rI   r   Zmatched_phrasescurr_posbufferZmatchedr~   ru   endZpatch_index_stringsr   Zpatch_index_1Zpatch_index_2Zposition_strr   r   r   r     sB   


z+Kosmos2Processor._insert_patch_index_tokensr~   c                 C   sh   t |dkr|\}}ntt| j}t||\}}dt|d d}dt|d d}||fS )Nrw   r1   r2   r3   )rT   r   mathsqrtr.   coordinate_to_patch_indexr4   r5   )rC   r~   Zidx_1Zidx_2num_patches_per_sideZtoken_1Ztoken_2r   r   r   r     s   
z4Kosmos2Processor._convert_bbox_to_patch_index_tokens)r/   )NNNN)NNr)   )T)#r   r   r   __doc__
attributesZvalid_kwargsZimage_processor_classZtokenizer_classrB   r   r   r   r   r   r"   r   rv   r   r   	BboxInputr   r   r4   rk   r   r   r   r   propertyr   r   r   r   r   __classcell__r   r   rF   r   r+   D   sb    /
 ##
C


.-
r+   r~   r   rJ   c                 C   s   | \}}}}||kr||kst dt|| }t|| }t|| d }t|| d }	|| | }
|	| | }|
|fS )a  Convert a bounding box to a pair of patch indices.

    Args:
        bbox (`Tuple[float, float, float, float]`):
            The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left and
            lower-right corners of the box. It should have x2 > x1 and y2 > y1.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `Tuple[int, int]`: A pair of patch indices representing the upper-left patch and lower-right patch.
    zTThe coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.rP   )rg   r   floorceil)r~   r   x1y1x2y2ul_xul_ylr_xlr_yul_idxlr_idxr   r   r   r     s   r   r   r   c                 C   s   d| }| | }| | }|| }|| }| |kr-|| }|| }	|| | }
|| | }n=||ks5||krJ|| }|| }	|| | }
|| | }n || |d  }|| |d  }	|| |d  }
|| |d  }||	|
|fS )a  
    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).

    Args:
        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `Tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
    g      ?rw   r   )r   r   r   	cell_sizer   r   r   r   r   r   r   r   r   r   r   patch_index_to_coordinate'  s(   r   c              	   C   s4  d}t || }g }|D ]}|d}| \}}}|s,d}|dd |dd f}|d}	g }
|	D ];}t d|}t d|dd }|rp|rp|r_|
t|dt|df q5|
t|dt|df q5|r|||||
f q|
D ]}d|d  d	|d  d
}||||gf q~q|S )a  Extract entities contained in `text`. The bounding bboxes is given in the form of patch indices.

    This functioin is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
    processing happens, including converting to normalized coordinates and whitespace character cleaning up.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> entities = extract_entities_with_patch_indices(text)
    >>> entities
    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
    ```z(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>rw   Nr   r0   z<patch_index_(\d+)>rP   r1   z><patch_index_r3   )	r   r   r   groupsr   searchr@   r   group)rI   patternmatchesentities_with_patch_indicesmatchr   Z
phrase_tagphraseZmatch_contentZpatch_index_pairsZentity_bboxespairr8   yr~   entityr   r   r   #extract_entities_with_patch_indicesT  s4   

$"r   c                 C   sP   | \}\}}t tdd|d| }t tdd|d| }|||ff}|S )zfAdjust the positions of the entities in `text` to be relative to the text with special fields removed.<.*?>r   N)rT   r   sub)r   rI   entity_namestartr   Zadjusted_startZadjusted_endadjusted_entityr   r   r   adjust_entity_positions  s
   r   c                 C   s   |   }t| t|   }g }|D ]5\}\}}}t|t|  }	t|t|  }
|| |	 }|| |
 }|  }||||f|f q||fS )z9Remove the spaces around the text and the entities in it.)r   rT   r:   r;   r@   )rI   entitiesnew_textZleading_spacesZnew_entitiesr   r   r   r   Zentity_name_leading_spacesZentity_name_trailing_spacesr   r   r   _cleanup_spaces  s   r       c           
         sp   t dd| }t| }g }|D ]#}|dd |d }}t|| } fdd|D }	|||	f  qt||S )a  Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
    >>> clean_text
    'An image of a snowman warming himself by a fire.'

    >>> entities
    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
    ```r   r   r   rw   c                    s    g | ]}t |d  |d  qS )r   rP   )r   )r7   r~   r   r   r   r9     s     z?clean_text_and_extract_entities_with_bboxes.<locals>.<listcomp>)r   r   r   r   r@   r   )
rI   r   Zprocessed_textr   r   itemr   r   r   Zbboxes_in_coordsr   r   r   r     s   

r   )r   )'r   rn   r   r   typingr   r   r   r   Zimage_processing_utilsr   Zimage_utilsr   r	   Zprocessing_utilsr
   r   r   r   r   Ztokenization_utilsr   Ztokenization_utils_baser   r   r   r   r   r   r   r"   r+   r   r   r   r   r   r   __all__r   r   r   r   <module>   s>      *G-:


