"""PyTorch KOSMOS-2 model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import LossKwargs, ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig


logger = logging.get_logger(__name__)


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to work with both ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
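

# Illustrative check (not part of the original module): padding tokens keep position
# `padding_idx` while real tokens count up from `padding_idx + 1`:
#
#     >>> create_position_ids_from_input_ids(torch.tensor([[5, 6, 7, 1, 1]]), padding_idx=1)
#     tensor([[2, 3, 4, 1, 1]])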


@dataclass
class Kosmos2ModelOutput(ModelOutput):
    r"""
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
            the weighted average in the self-attention heads.
        vision_model_output(`BaseModelOutputWithPooling`, *optional*):
            The output of the [`Kosmos2VisionModel`].
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[Tuple[torch.FloatTensor]] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
    r"""
    Model output class for `Kosmos2ForConditionalGeneration`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
            the weighted average in the self-attention heads.
        vision_model_output(`BaseModelOutputWithPooling`, *optional*):
            The output of the [`Kosmos2VisionModel`].
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[Tuple[torch.FloatTensor]] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Kosmos2VisionEmbeddings(nn.Module):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
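
        Example (an illustrative sketch added here, not from the original docstring; `vision_embeddings`
        stands for an instance of this module pretrained at 224x224):

        ```python
        # A 336x336 input has a (336 // patch_size)**2 patch grid, so the pre-trained
        # position table must be resized on the fly:
        embeddings = vision_embeddings(pixel_values_336, interpolate_pos_encoding=True)
        ```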
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Kosmos2VisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # in case the FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
        if self.config._attn_implementation == "flash_attention_2":
            self.is_causal = causal_attention_mask is not None
        else:
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class Kosmos2VisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Kosmos2VisionEncoderLayer(nn.Module):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Kosmos2VisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = Kosmos2VisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class Kosmos2VisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    """

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Kosmos2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class Kosmos2VisionTransformer(nn.Module):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = Kosmos2VisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = Kosmos2VisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Kosmos2TextSinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward put the weights on the correct dtype and device of the param
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.Tensor] = None,
    ):
        if input_ids is not None:
            bsz, seq_len = input_ids.size()
            if position_ids is None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(
                    input_ids, self.padding_idx, past_key_values_length
                ).to(input_ids.device)
        else:
            bsz, seq_len = inputs_embeds.size()[:-1]
            if position_ids is None:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)

        # expand embeddings if needed
        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()

    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
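
        Example (illustrative): for `inputs_embeds` of shape `(1, 3, hidden_size)`, `padding_idx=1` and
        `past_key_values_length=0`, the returned ids are `tensor([[2, 3, 4]])`.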
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length


class KosmosTextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Similar to BartAttention, with an additional optional inner attention LayerNorm.
    def __init__(
        self,
        config,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        add_inner_attn_layernorm: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.config = config
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        self.inner_attn_ln = None
        if add_inner_attn_layernorm:
            self.inner_attn_ln = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def _shape(self, projection: torch.Tensor) -> torch.Tensor:
        new_projection_shape = projection.size()[:-1] + (self.num_heads, self.head_dim)
        # move heads to 2nd position: (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
        return new_projection

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        # if encoder_hidden_states are provided, this layer is used as a cross-attention layer for the decoder
        is_cross_attention = encoder_hidden_states is not None
        batch_size, seq_length = hidden_states.shape[:2]

        # use encoder_hidden_states if cross attention
        current_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
        # checking that the `sequence_length` of the `past_key_value` is the same as the provided
        # `encoder_hidden_states` to support prefix tuning
        if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]:
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        else:
            key_states = self._shape(self.k_proj(current_states))
            value_states = self._shape(self.v_proj(current_states))
            if past_key_value is not None and not is_cross_attention:
                # reuse k, v, self_attention
                key_states = torch.cat([past_key_value[0], key_states], dim=2)
                value_states = torch.cat([past_key_value[1], value_states], dim=2)

        query_states = self._shape(self.q_proj(hidden_states))

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()

        if self.inner_attn_ln is not None:
            attn_output = self.inner_attn_ln(attn_output)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights, past_key_value


class Kosmos2TextFFN(nn.Module):
    def __init__(self, config: Kosmos2TextConfig):
        super().__init__()

        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.fc1 = nn.Linear(config.embed_dim, config.ffn_dim)
        self.fc2 = nn.Linear(config.ffn_dim, config.embed_dim)

        self.ffn_layernorm = nn.LayerNorm(config.ffn_dim, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.ffn_layernorm(hidden_states)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        return hidden_states


class Kosmos2TextBlock(nn.Module):
    def __init__(self, config: Kosmos2TextConfig):
        super().__init__()
        self.embed_dim = config.embed_dim

        self.self_attn = KosmosTextAttention(
            config,
            embed_dim=self.embed_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            add_inner_attn_layernorm=True,
        )
        self.dropout = config.dropout
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

        if config.add_cross_attention:
            self.encoder_attn = KosmosTextAttention(
                config,
                embed_dim=self.embed_dim,
                num_heads=config.attention_heads,
                dropout=config.attention_dropout,
                is_decoder=True,
                add_inner_attn_layernorm=False,
            )
            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

        self.ffn = Kosmos2TextFFN(config)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None

        hidden_states = self.self_attn_layer_norm(hidden_states)

        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            if not hasattr(self, "encoder_attn"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention"
                    " layers by setting `config.add_cross_attention=True`"
                )

            residual = hidden_states

            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
                **kwargs,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        residual = hidden_states

        hidden_states = self.final_layer_norm(hidden_states)

        # FFN
        hidden_states = self.ffn(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class Kosmos2TextTransformer(nn.Module):
    """
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    """

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__()
        self.config = config
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop

        self.embed_scale = math.sqrt(config.embed_dim) if config.scale_embedding else 1.0
        self.embed_tokens = nn.Embedding(config.vocab_size, config.embed_dim, padding_idx=config.pad_token_id)

        self.embed_positions = Kosmos2TextSinusoidalPositionalEmbedding(
            num_positions=config.max_position_embeddings,
            embedding_dim=config.embed_dim,
            padding_idx=config.pad_token_id,
        )

        self.layers = nn.ModuleList([Kosmos2TextBlock(config) for _ in range(config.layers)])
        self.layer_norm = nn.LayerNorm(config.embed_dim, config.layer_norm_eps)

        self.gradient_checkpointing = False

    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        # create causal mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
                inputs_embeds.device
            )
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward_embedding(
        self,
        input_ids,
        inputs_embeds: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        img_input_mask: Optional[torch.Tensor] = None,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.Tensor] = None,
    ):
        # The argument `inputs_embeds` should be the one without being multiplied by `self.embed_scale`.
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if image_embeds is not None:
            inputs_embeds[img_input_mask.to(dtype=torch.bool)] = image_embeds.to(inputs_embeds.device).view(
                -1, image_embeds.size(-1)
            )

        inputs_embeds = inputs_embeds * self.embed_scale

        # embed positions
        positions = self.embed_positions(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
            position_ids=position_ids,
        )
        positions = positions.to(inputs_embeds.device)

        hidden_states = inputs_embeds + positions

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        return hidden_states

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        # We don't need image info when generating with cache (past_key_values_length > 0)
        if past_key_values_length > 0:
            image_embeds = None
            image_embeds_position_mask = None

        hidden_states = self.forward_embedding(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            img_input_mask=image_embeds_position_mask,
            past_key_values_length=past_key_values_length,
            position_ids=position_ids,
        )

        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, input_shape, hidden_states, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _expand_mask(encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1])

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        present_key_value_states = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    **kwargs,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                present_key_value_states += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add final layer norm
        hidden_states = self.layer_norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    present_key_value_states,
                    all_hidden_states,
                    all_self_attns,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring
class Kosmos2PreTrainedModel(PreTrainedModel):
    config_class = Kosmos2Config
    supports_gradient_checkpointing = True
    _no_split_modules = ["Kosmos2VisionEncoderLayer", "Kosmos2TextBlock"]
    _supports_attention_backend = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(self, Kosmos2VisionModel):
            factor = self.config.initializer_factor
        elif isinstance(self, (Kosmos2Model, Kosmos2ForConditionalGeneration)):
            factor = self.config.vision_config.initializer_factor

        if isinstance(self, (Kosmos2TextModel, Kosmos2TextForCausalLM)):
            std = self.config.init_std
        elif isinstance(self, (Kosmos2Model, Kosmos2ForConditionalGeneration)):
            std = self.config.text_config.init_std

        if isinstance(module, Kosmos2VisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, Kosmos2VisionAttention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            for proj, proj_std in (
                (module.q_proj, in_proj_std),
                (module.k_proj, in_proj_std),
                (module.v_proj, in_proj_std),
                (module.out_proj, out_proj_std),
            ):
                nn.init.normal_(proj.weight, std=proj_std)
                if proj.bias is not None:
                    proj.bias.data.zero_()
        elif isinstance(module, Kosmos2VisionMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
            if module.fc1.bias is not None:
                module.fc1.bias.data.zero_()
            if module.fc2.bias is not None:
                module.fc2.bias.data.zero_()
        elif isinstance(module, Kosmos2VisionEncoderLayer):
            module.layer_norm1.bias.data.zero_()
            module.layer_norm1.weight.data.fill_(1.0)
            module.layer_norm2.bias.data.zero_()
            module.layer_norm2.weight.data.fill_(1.0)
        elif isinstance(module, Kosmos2VisionTransformer):
            module.pre_layrnorm.bias.data.zero_()
            module.pre_layrnorm.weight.data.fill_(1.0)
            module.post_layernorm.bias.data.zero_()
            module.post_layernorm.weight.data.fill_(1.0)
        elif isinstance(module, KosmosTextAttention):
            for proj in (module.q_proj, module.k_proj, module.v_proj, module.out_proj):
                nn.init.normal_(proj.weight, std=std)
                if proj.bias is not None:
                    proj.bias.data.zero_()
        elif isinstance(module, Kosmos2TextFFN):
            nn.init.normal_(module.fc1.weight, std=std)
            nn.init.normal_(module.fc2.weight, std=std)
            if module.fc1.bias is not None:
                module.fc1.bias.data.zero_()
            if module.fc2.bias is not None:
                module.fc2.bias.data.zero_()
        elif isinstance(module, Kosmos2TextForCausalLM):
            nn.init.normal_(module.lm_head.weight, std=std)
            if module.lm_head.bias is not None:
                module.lm_head.bias.data.zero_()
        elif isinstance(module, Kosmos2ImageToTextProjection):
            nn.init.normal_(module.dense.weight, std=std)
            if module.dense.bias is not None:
                module.dense.bias.data.zero_()
        elif isinstance(module, Kosmos2TextTransformer):
            module.embed_tokens.weight.data.normal_(mean=0.0, std=std)
            if module.embed_tokens.padding_idx is not None:
                module.embed_tokens.weight.data[module.embed_tokens.padding_idx].zero_()


class Kosmos2VisionModel(Kosmos2PreTrainedModel):
    config_class = Kosmos2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__(config)
        self.model = Kosmos2VisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        return self.model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


class Kosmos2TextModel(Kosmos2PreTrainedModel):
    config_class = Kosmos2TextConfig

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__(config)
        self.model = Kosmos2TextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
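
            For example (an illustrative sketch, not from the original docstring; `num_text_tokens` is a
            hypothetical count of trailing text tokens), a prompt carrying `latent_query_num = 64` image
            features right after the `<s><image>` prefix would use:

            ```python
            image_embeds_position_mask = [0, 0] + [1] * 64 + [0] + [0] * num_text_tokens
            ```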
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring(
    custom_intro="""
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
    config_class = Kosmos2TextConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__(config)

        self.model = Kosmos2TextTransformer(config)
        self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )
        lm_logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=lm_logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        if not return_dict:
            output = (lm_logits,) + outputs.to_tuple()[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        image_embeds=None,
        image_embeds_position_mask=None,
        past_key_values=None,
        attention_mask=None,
        use_cache=None,
        cache_position=None,
        **model_kwargs,
    ):
        position_ids = create_position_ids_from_input_ids(
            input_ids,
            padding_idx=self.config.pad_token_id,
            past_key_values_length=0,
        )

        if past_key_values is not None:
            image_embeds = None
            image_embeds_position_mask = None
        elif image_embeds_position_mask is not None:
            # appending `False` to `image_embeds_position_mask` (because `input_ids` grows during generation)
            batch_size, seq_len = input_ids.size()
            mask_len = image_embeds_position_mask.size()[-1]
            image_embeds_position_mask = torch.cat(
                (
                    image_embeds_position_mask,
                    torch.zeros(size=(batch_size, seq_len - mask_len), dtype=torch.bool, device=input_ids.device),
                ),
                dim=1,
            )

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            use_cache=use_cache,
            position_ids=position_ids,
            cache_position=cache_position,
            **model_kwargs,
        )

        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


class Kosmos2ImageToTextProjection(nn.Module):
    """The layer that transforms the image model's output to part of the text model's input (namely, image features)"""

    def __init__(self, config: Kosmos2Config):
        super().__init__()
        self.dense = nn.Linear(config.vision_config.hidden_size, config.text_config.embed_dim)
        self.latent_query = nn.Parameter(torch.randn(config.latent_query_num, config.text_config.embed_dim))

        self.x_attn = KosmosTextAttention(
            config.text_config,
            config.text_config.embed_dim,
            config.text_config.attention_heads,
            dropout=config.text_config.attention_dropout,
            is_decoder=False,
            add_inner_attn_layernorm=False,
        )

    def forward(self, features):
        hidden_states = self.dense(features)

        # shape = [batch, latent_query_num, h_dim]
        latent_query = self.latent_query.unsqueeze(0).expand(hidden_states.size(0), -1, -1)
        key_value_states = torch.cat([hidden_states, latent_query], dim=1)

        hidden_states, attn_weights, _ = self.x_attn(
            hidden_states=latent_query,
            encoder_hidden_states=key_value_states,
            past_key_value=None,
            attention_mask=None,
            output_attentions=None,
        )

        return hidden_states, attn_weights


@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c                %       s.  e Zd ZeZdZdef fddZdejfddZ	dd	 Z
	
	
ddejdee dee fddZee													
	d deej deej deej deej deej deeej  deej deej deej dee dee dee dedee dee deeef f ddZ  ZS )!r3  r   re   c                    :   t  | t|j| _t|j| _t|| _	| 
  d S r   )rk   rl   r6  r8  
text_modelr2  r5  vision_modelr<  image_to_text_projectionrC  rz   r{   r+   r,   rl   L  s
   
zKosmos2Model.__init__rI   c                 C   rE  r   re  rB  r  rS   r+   r+   r,   rF  V  rG  z!Kosmos2Model.get_input_embeddingsc                 C      || j j_d S r   rh  rO  r+   r+   r,   rP  Y     z!Kosmos2Model.set_input_embeddingsFreturn_attentionsr   c                 C   sN   | j ||d}| j j|d }tjj|dd}| |\}}|r%||fS |S )aD  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
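
        Example (an illustrative sketch, assuming `model` is a loaded `Kosmos2Model`):

        ```python
        image_embeds = model.get_image_features(pixel_values)
        # shape: (batch_size, latent_query_num, text_config.embed_dim)
        ```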
        """
        vision_model_output = self.vision_model(
            pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
        )
        # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
        image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
        # normalized features
        image_embeds = nn.functional.normalize(image_embeds, dim=-1)
        image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)
        if return_attentions:
            return image_embeds, projection_attentions
        return image_embeds

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Kosmos2ModelOutput]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")
            image_embeds, projection_attentions = self.get_image_features(
                pixel_values, return_attentions=True, interpolate_pos_encoding=interpolate_pos_encoding
            )

        outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            outputs = outputs + (image_embeds, projection_attentions, vision_model_output)
            return tuple(output for output in outputs if output is not None)

        return Kosmos2ModelOutput(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )


@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c                %       st  e Zd ZeZdZdgZdef fddZdej	fddZ
d	d
 Zdej	fddZdd Zee														d"deej deej deej deej deej deeej  deej deej deej deej dee dee dee dee dee deeef f ddZ					d#deej deej deej deej deej f
d d!Z  ZS )$r4  r   ztext_model.lm_head.weightre   c                    rd  r   )rk   rl   r7  r8  re  r2  r5  rf  r<  rg  rC  rz   r{   r+   r,   rl     s
   
z(Kosmos2ForConditionalGeneration.__init__rI   c                 C   rE  r   rh  rS   r+   r+   r,   rF    rG  z4Kosmos2ForConditionalGeneration.get_input_embeddingsc                 C   ri  r   rh  rO  r+   r+   r,   rP    rj  z4Kosmos2ForConditionalGeneration.set_input_embeddingsc                 C   s
   | j  S r   )re  rU  rS   r+   r+   r,   rU    rG  z5Kosmos2ForConditionalGeneration.get_output_embeddingsc                 C   s   | j | d S r   )re  rW  rV  r+   r+   r,   rW     s   z5Kosmos2ForConditionalGeneration.set_output_embeddingsNr>   r&  r   r'  rC   rF   r   rh   rX  r  r   r   r   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}d}d}|du rO|du r.td| j||||d}| jj|d }tj	j
|dd}| |\}}| jd
||||||||	|
|||dd|}t|j|j|j|j|j|||d	S )a5  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")

            vision_model_output = self.vision_model(
                pixel_values=pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        lm_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )

        if not return_dict:
            outputs = lm_outputs + (image_embeds, projection_attentions, vision_model_output)
            return tuple(output for output in outputs if output is not None)

        return Kosmos2ForConditionalGenerationModelOutput(
            loss=lm_outputs.loss,
            logits=lm_outputs.logits,
            past_key_values=lm_outputs.past_key_values,
            hidden_states=lm_outputs.hidden_states,
            attentions=lm_outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )

    def generate(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        # in order to allow `inputs` argument (as in `GenerationMixin`)
        inputs = kwargs.pop("inputs", None)
        if pixel_values is not None and inputs is not None:
            raise ValueError(
                f"`inputs`: {inputs} were passed alongside `pixel_values` which is not allowed."
                f"Make sure to either pass `inputs` or pixel_values=..."
            )
        if pixel_values is None and inputs is not None:
            pixel_values = inputs

        if image_embeds is None:
            vision_model_output = self.vision_model(pixel_values)
            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        output = self.text_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            **kwargs,
        )

        return output


__all__ = ["Kosmos2ForConditionalGeneration", "Kosmos2Model", "Kosmos2PreTrainedModel"]