
"""PyTorch KOSMOS-2 model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, List, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import LossKwargs, ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig


logger = logging.get_logger(__name__)


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor of token ids
        padding_idx: int, id of the padding token
        past_key_values_length: int, length of the cached prefix to offset positions by

    Returns: torch.Tensor
    """
    # The masked cumulative sum assigns increasing positions to real tokens and keeps padding at `padding_idx`.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

@dataclass
class Kosmos2ModelOutput(ModelOutput):
    """
Base class for text model's outputs that also contains a pooling of the last hidden states.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsprojection_attentionsvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f)text_model_outputrW   Ngetattrto_tuple.0kselfs     r1   	<genexpr>.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>   <      
   LLDGRYZ^`aRbRkRkRmm    25tuplekeysrc   s   `r1   r_   Kosmos2ModelOutput.to_tuple   #     
YY[
 
 	
r3    )__name__
__module____qualname____firstlineno____doc__rQ   r   r)   FloatTensor__annotations__rR   r   rS   rT   rU   rV   rW   r   r   r_   __static_attributes__rn   r3   r1   rO   rO   [   s    $L 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju0012904L(5,,-4@D8E%*;*;$<=D6:3:
%* 

@dataclass
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
    """
Model output class for `Kosmos2ForConditionalGeneration`.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[Tuple[torch.FloatTensor]] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Kosmos2VisionEmbeddings(nn.Module):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input sizes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings
   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r1   eager_attention_forwardr   !  s     <<}}R'<=GL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r3   c                      ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                  \\R                     4   4
S	 jjrS
rU =r$ )Kosmos2VisionAttentioni7  =Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)r   r   r   r   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr
   Lineark_projv_projq_projout_projr   s     r1   r   Kosmos2VisionAttention.__init__:  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar3   rS   r   causal_attention_maskoutput_attentionsrX   c                    UR                   u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  XVU R
                  U R                  5      R                  SS5      nU	R	                  XVU R
                  U R                  5      R                  SS5      n	U
R	                  XVU R
                  U R                  5      R                  SS5      n
U R                  R                  S:w  a  Ub  Ub  X#-   nOUb  UnO	USLU l
        [        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU" U UU	U
UU R                  U R                  U R                   (       d  SOU R"                  S	9u  pUR%                  XVU5      R'                  5       nU R)                  U5      nU(       d  SnX4$ )
#Input shape: Batch x Time x Channelr   r   flash_attention_2Neagersdpa`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   )r   r   r   r   r?   r   r   r   r   _attn_implementationr   r   loggerwarning_oncer   r   r   r   r   r   r   )rc   rS   r   r   r   r   
seq_lengthr   queriesrj   valuesattention_interfacer   r   s                 r1   r   Kosmos2VisionAttention.forwardN  s    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/!G&2!62$>DN(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0 L((r3   )r   r   r   r   r   r   r   r   r   r   r   )NNF)ro   rp   rq   rr   rs   r   r)   r   r   r*   r   r   rv   r   r   s   @r1   r   r   7  s    GB. 268<,15)||5) !.5)  (5	5)
 $D>5) 
u||Xell33	45) 5)r3   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Kosmos2VisionMLPi  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g N)r   r   r   r   
hidden_actactivation_fnr
   r   r   intermediate_sizefc1fc2r   s     r1   r   Kosmos2VisionMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJr3   rS   rX   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )r
  r  r  rc   rS   s     r1   r   Kosmos2VisionMLP.forward  s4    /**=9/r3   )r  r   r
  r  )
ro   rp   rq   rr   r   r)   r   r   rv   r   r   s   @r1   r  r    s)    KU\\ ell  r3   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\\	   S\
\R                     4
S	 jjrS
rU =r$ )Kosmos2VisionEncoderLayeri  r   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g Neps)r   r   r   r   r   	self_attnr
   	LayerNormlayer_norm_epslayer_norm1r  mlplayer_norm2r   s     r1   r   "Kosmos2VisionEncoderLayer.__init__  sm    ++/7<<F<Q<QR#F+<<F<Q<QRr3   rS   r   r   r   rX   c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)rS   r   r   r   )r  r  r  r  )rc   rS   r   r   r   residualr   outputss           r1   r   !Kosmos2VisionEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr3   )r   r  r  r  r  r   )ro   rp   rq   rr   r   r   r)   r   r   r*   r   rt   r   rv   r   r   s   @r1   r  r    sl    S2 S -2&||& &  %||	&
 $D>& 
u  	!& &r3   r  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\\R                     S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jjrSrU =r$ )Kosmos2VisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Kosmos2VisionEncoderLayer`].

Args:
    config: Kosmos2VisionConfig
r   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r   r   r   r
   
ModuleListrangenum_hidden_layersr  layersgradient_checkpointingrc   r   r   r   s      r1   r   Kosmos2VisionEncoder.__init__  sU    mmPUV\VnVnPo$pPo1%>v%FPo$pq&+# %qs   A&r   r   r   output_hidden_statesreturn_dictrX   c                 L   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       Hr  u  pU(       a  Xy4-   nU R                  (       a1  U R                  (       a   U R                  UR                  U	UUU5      nO	U" U	UUUS9nUS   n	U(       d  Mj  XS   4-   nMt     U(       a  Xy4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nrn   )r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr  rn   )ra   vs     r1   rd   /Kosmos2VisionEncoder.forward.<locals>.<genexpr>%  s     e$Sq$Ss   	)rQ   rS   rT   )r   r   r+  use_return_dict	enumerater'  r(  r   _gradient_checkpointing_func__call__ri   r   )rc   inputs_embedsr   r   r   r+  r,  encoder_statesall_attentionsrS   idxencoder_layerlayer_outputss                r1   r   Kosmos2VisionEncoder.forward  s8   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M  !/3C2E!E- #90  +.>>Ne]N$Seee+Vd
 	
r3   )r   r(  r'  NNNNN)ro   rp   rq   rr   rs   r   r   r   r)   r   r*   r	   r   r   r   rv   r   r   s   @r1   r"  r"    s    ,2 , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
 O
r3   r"  c                      ^  \ rS rSrS\4U 4S jjr     SS\\R                     S\\	   S\\	   S\	S\\	   S	\
\\4   4S
 jjrSrU =r$ )Kosmos2VisionTransformeri,  r   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r  )r   r   r   r   r   r   r
   r  r  pre_layrnormr"  encoderpost_layernorm)rc   r   r   r   s      r1   r   !Kosmos2VisionTransformer.__init__.  sd    &&	1&9LL8M8MN+F3 ll9:O:OPr3   r   r   r+  r   r,  rX   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  U5      nU R                  UUUUS9nUS   nUS S 2SS S 24   n	U R                  U	5      n	U(       d	  X4USS  -   $ [        UU	UR                  UR                  S9$ )Nz You have to specify pixel_values)r   )r5  r   r+  r,  r   r   )rQ   pooler_outputrS   rT   )r   r   r+  r1  r   r   r@  rA  rB  r   rS   rT   )
rc   r   r   r+  r   r,  rS   encoder_outputsrQ   pooled_outputs
             r1   r    Kosmos2VisionTransformer.forward8  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%58KKK)/')77&11	
 	
r3   )r   r   rA  rB  r@  NNNFN)ro   rp   rq   rr   r   r   r   r)   rt   r*   r	   r   r   r   rv   r   r   s   @r1   r>  r>  ,  s    Q2 Q 59,0/3).&*'
u001'
 $D>'
 'tn	'

 #''
 d^'
 
u00	1'
 '
r3   r>  c                   2  ^  \ rS rSrSrSS\S\S\\   4U 4S jjjrSS\S\S\\   4S jjr\	SS\S\S\\   4S	 jj5       r
\R                  " 5           SS
\\R                     S\\R                     S\S\\R                     4S jj5       rS rSrU =r$ )(Kosmos2TextSinusoidalPositionalEmbeddingic  zDThis module produces sinusoidal positional embeddings of any length.r   embedding_dimrK   c                    > [         TU ]  5         SU l        X l        X0l        U R                  XR                  -   X#5        g )Nr   )r   r   offsetrL  rK   make_weights)rc   r   rL  rK   r   s       r1   r   1Kosmos2TextSinusoidalPositionalEmbedding.__init__g  s8    *&-++5}Rr3   num_embeddingsc                     U R                  XU5      n[        U S5      (       a8  UR                  U R                  R                  U R                  R
                  S9nU R                  SUSS9  g )Nweightsr9   Fr   )get_embeddinghasattrr'   rS  r!   r5   r   )rc   rQ  rL  rK   emb_weightss        r1   rO  5Kosmos2TextSinusoidalPositionalEmbedding.make_weightso  s\    ((T4##%..t||/A/A$,,J]J].^KYFr3   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the odd dimension
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor = None,
        inputs_embeds: torch.Tensor = None,
        past_key_values_length: int = 0,
        position_ids: torch.Tensor = None,
    ):
        if input_ids is not None:
            bsz, seq_len = input_ids.size()
            if position_ids is None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(
                    input_ids, self.padding_idx, past_key_values_length
                ).to(input_ids.device)
        else:
            bsz, seq_len = inputs_embeds.size()[:-1]
            if position_ids is None:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)

        # expand the cached table if the requested positions exceed it
        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()

    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
\	R                  S\	R                  4S jr     SS\	R                  S\\	R                     S\\\	R                        S\\	R                     S\\	R                     S\S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )KosmosTextAttentioni  r   r   r   r   
is_decoderadd_inner_attn_layernormr   c                 >  > [         TU ]  5         Xl        X l        X0l        X@l        X#-  U l        U R                  U-  U R                  :w  a  [        SU R                   SU S35      eU R                  S-  U l        XPl	        [        R                  " X"US9U l        [        R                  " X"US9U l        [        R                  " X"US9U l        [        R                  " X"US9U l        S U l        U(       a$  [        R"                  " X!R$                  S9U l        g g )Nr   r   r   r   )r   r  )r   r   r   r   r   r   r   r   r   rs  r
   r   r   r   r   r   inner_attn_lnr  r  )	rc   r   r   r   r   rs  rt  r   r   s	           r1   r   KosmosTextAttention.__init__  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$ii	4@ii	4@ii	4@		)TB "#!#i=R=R!SD $r3   
projectionrX   c                     UR                  5       S S U R                  U R                  4-   nUR                  U5      R	                  SSSS5      nU$ )Nr8   r   r   r   r   )r%   r   r   r?   r   )rc   rx  new_projection_shapenew_projections       r1   _shapeKosmosTextAttention._shape  sO    )0"58WW#)=>FFq!QPQRr3   rS   encoder_hidden_statespast_key_valuer   layer_head_maskr   c                    USLnUR                   SS u  pUb  UOUnU(       a5  U(       a.  US   R                   S   UR                   S   :X  a  US   nUS   nO~U R                  U R                  U5      5      nU R                  U R                  U5      5      nUb;  U(       d4  [        R
                  " US   U/SS9n[        R
                  " US   U/SS9nU R                  U R                  U5      5      nU R                  (       a  X4n[        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU" U UUUU4U R                  (       d  S	OU R                  U R                   S
.UD6u  nnUR#                  XS5      R%                  5       nU R&                  b  U R'                  U5      nU R)                  U5      nUUU4$ )r   Nr   r   r   r:   r   r   r   r   )r   r   r8   )r   r|  r   r   r)   r@   r   rs  r   r   r   r   r   r   r   r   r   r   r   rv  r   )rc   rS   r~  r  r   r  r   r   is_cross_attentionr   r   current_states
key_statesvalue_statesquery_statesr  r   r   s                     r1   r   KosmosTextAttention.forward  s    3$>!.!4!4Ra!8
 3H2S.Yf .^A5F5L5LQ5OSaSgSghiSj5j'*J)!,LT[[%@AJ;;t{{>'BCL)2D"YYq(9:'FAN
$yy.*;\)JPQR{{4;;}#=>?? )7N(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$,,LL	%
 	%
!\ "))*"EPPR),,[9KmmK0L.88r3   )r   r   r   r   rv  rs  r   r   r   r   r   r   )r   FFT)NNNNF)ro   rp   rq   rr   rs   rF   r]  r*   r   r)   r   r|  r   r   r   rv   r   r   s   @r1   rr  rr    s:   G  ).!T !T 	!T
 !T !T #'!T !T !TF %,,  9=8<1526"'I9||I9  (5I9 !u||!45	I9
 !.I9 "%,,/I9  I9 
u||Xell3XeELL>Q5RR	SI9 I9r3   rr  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Kosmos2TextFFNi4  r   c                   > [         TU ]  5         UR                  U l        [        UR                     U l        UR                  U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        g r  )r   r   r   r   activation_functionr  activation_dropoutr
   r   r   ffn_dimr
  r  r  r  ffn_layernormr   s     r1   r   Kosmos2TextFFN.__init__5  s    ~~#F$>$>?"(";";99V--v~~>99V^^V-=-=>\\&..f>S>STr3   c                 R   U R                  U R                  U5      5      n[        R                  R	                  XR
                  U R                  S9nU R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R                  S9nU$ )Nr   )	r  r
  r
   r   r   r  r   r  r  r  s     r1   r   Kosmos2TextFFN.forwardA  s    **488M+BC--m?V?Vaeanan-o**=9/--m||VZVcVc-dr3   )r  r  r   r
  r  r  )	ro   rp   rq   rr   r   r   r   rv   r   r   s   @r1   r  r  4  s    
U0 
U r3   r  c                     ^  \ rS rSrS\4U 4S jjr        SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	\R                        S\\
   S\\
   S\	\R                  \\	\R                  \R                  4      4   4S jjrSrU =r$ )Kosmos2TextBlockiK  r   c           	        > [         TU ]  5         UR                  U l        [        UU R                  UR                  UR
                  SSS9U l        UR                  U l        [        R                  " U R                  UR                  S9U l        UR                  (       a_  [        UU R                  UR                  UR
                  SSS9U l        [        R                  " U R                  UR                  S9U l        [        U5      U l        [        R                  " U R                  UR                  S9U l        g )NT)r   r   r   rs  rt  r  F)r   r   r   rr  attention_headsr   r  r   r
   r  r  self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normr  ffnfinal_layer_normr   s     r1   r   Kosmos2TextBlock.__init__L  s    )),nn,,,,%)
 ~~$&LLVEZEZ$[!%% 3.. 0000).!D ,.<<FLaLa+bD(!&) "T^^AVAV Wr3   rS   r   r~  encoder_attention_maskr  cross_attn_layer_head_maskr  r   	use_cacherX   c
           
         UnUb  US S OS nU R                  U5      nU R                  " S	UUUUUS.U
D6u  pn[        R                  R	                  XR                  U R
                  S9nX-   nS nS nUb  [        U S5      (       d  [        SU  S35      eUnU R                  U5      nUb  USS  OS nU R                  " S	UUUUUUS.U
D6u  nnn[        R                  R	                  XR                  U R
                  S9nX-   nX-   nUnU R                  U5      nU R                  U5      nX-   nU4nU(       a  UUU4-  nU	(       a  UU4-  nU$ )
Nr   )rS   r  r   r  r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )rS   r~  r   r  r  r   rn   )r  r  r
   r   r   r   rU  r   r  r  r  r  )rc   rS   r   r~  r  r  r  r  r   r  r   r  self_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valuer  s                      r1   r   Kosmos2TextBlock.forwardi  s    ! :H9S>"1#5Y] 11-@ ?Cnn ?
'3)+/?
 ?
;*; --m||VZVcVc-d 0 (,$! ,400 =dV DD D 
 %H 88GM @N?Yrs(;_c%NRN_N_ O+&;5 :8"3O OKM-/K MM11-<<Z^ZgZg1hM$4M !2 P !--m< / 0 ")+=>>G)++Gr3   )r   r   r  r  r  r  r  r  )NNNNNNFT)ro   rp   rq   rr   r   r   r)   r   r   r   r*   rt   r   rv   r   r   s   @r1   r  r  K  s   X0 X@ 268<9=26=A8<,1$(Q||Q !.Q  (5	Q
 !) 6Q "%,,/Q %-U\\$:Q !u||!45Q $D>Q D>Q 
u  (51B1BEDUDU1U+V"WW	XQ Qr3   r  c            &         ^  \ rS rSrSrS\4U 4S jjrS r     SS\\	R                     S\\	R                     S\\	R                     S	\S
\\	R                     4
S jjr\               SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\\	R                        S\\	R                     S
\\	R                     S\\   S\\   S\\   S\\   S\\   S\\\4   4"S jj5       rSrU =r$ )Kosmos2TextTransformeri  z
Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

Args:
    config: Kosmos2TextConfig
r   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR
                  (       a   [        R                  " UR                  5      OSU l	        [        R                  " UR                  UR                  UR                  S9U l        [        UR                   UR                  UR                  S9U l        [        R$                  " ['        UR(                  5       Vs/ s H  n[+        U5      PM     sn5      U l        [        R,                  " UR                  UR.                  5      U l        SU l        g s  snf )Nr$   )rK   )r   rL  rK   F)r   r   r   r   	layerdropscale_embeddingrY  sqrtr   embed_scaler
   r   
vocab_sizepad_token_idembed_tokensrK  max_position_embeddingsembed_positionsr$  r%  r'  r  r  r  
layer_normr(  r)  s      r1   r   Kosmos2TextTransformer.__init__  s    ~~)):@:P:P499V%5%56VYLL):):F<L<LZ`ZmZmnG 88 **++ 
 mmuV]]G[$\G[!%5f%=G[$\],,v'7'79N9NO&+# %]s   >Ec                     S nUS   S:  a   [        UUR                  UR                  US9nUb9  [        XR                  US   S9R	                  UR                  5      nUc  UOXe-   nU$ )Nr8   r   )r5   r6   r"   )rC   r!   r5   r2   r'   )rc   r   rl  r5  r6   combined_attention_maskexpanded_attn_masks          r1   _prepare_decoder_attention_mask6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7##$++'=	'# %!-n>Q>Q[fgi[j!k!n!n$$" '>&E"K]Kw $ '&r3   r5  rU   img_input_maskr6   r   c                    Uc  U R                  U5      nUbW  UR                  UR                  5      R                  SUR	                  S5      5      X$R                  [
        R                  S9'   X R                  -  nU R                  UUUUS9nUR                  UR                  5      nX'-   n[        R                  R                  XR                  U R                  S9nU$ )Nr8   r   )rJ   r5  r6   r   r   )r  r'   r5   r?   r%   r)   r*   r  r  r
   r   r   r   )	rc   rJ   r5  rU   r  r6   r   	positionsrS   s	            r1   forward_embedding(Kosmos2TextTransformer.forward_embedding  s       --i8M#AMQ^QeQeAfAkAkL%%b)BM++%**+=> &(8(88 (('#9%	 ) 
	 LL!5!56	%1--m||VZVcVc-dr3   rJ   r   image_embeds_position_maskr~  r  	head_maskcross_attn_head_maskrR   r  r   r+  r,  r   rX   c                 J   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  U
b  [        S5      eUb"  UR                  nUR                  SUS   5      nO"U
b  U
R                  5       S S nO[        S5      eU	b  U	S   S   R                  S   OSnUS:  a  S nS nU R                  UU
UUUUS9nU R                  UUUU5      nUb  Ub  [        XjR                  US   S9n[        R                  R                  UU R                  U R                   S9nU R"                  (       a/  U R                   (       a  U(       a  [$        R'                  S	5        S
nU(       a  SOS nU(       a  SOS nU(       a  Ub  SOS nU(       a  SOS n[)        Xx/SS/5       Hn  u  nnUc  M  UR                  5       S   [+        U R,                  5      :w  d  M7  [        SU S[+        U R,                  5       SUR                  5       S    S35      e   [/        U R,                  5       GH  u  nnU(       a  UU4-  nU R                   (       a(  [0        R2                  " / 5      nUU R4                  :  a  MM  U	b  U	U   OS nU R"                  (       aF  U R                   (       a5  U R7                  UR8                  UUUUUb  UU   OS Ub  UU   OS S UU5
      nO"U" U4UUUUb  UU   OS Ub  UU   OS UUUS.UD6nUS   nU(       a  UUU(       a  SOS   4-  nU(       d  M  UUS   4-  nUc  GM  UUS   4-  nGM     U R;                  U5      nU(       a  UU4-  n[=        UUUUUS9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer8   z5You have to specify either input_ids or inputs_embedsr   r   )rJ   r5  rU   r  r6   r   r  r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Frn   r  r  zThe `z` should be specified for z layers, but it is for .)r   r~  r  r  r  r  r   r  r   r   )rQ   rR   rS   rT   cross_attentions)r   r   r+  r  r1  r   r   r?   r%   r  r  r2   r!   r
   r   r   r   r(  r   r   ziplenr'  r2  r)   randr  r3  r4  r  r   )rc   rJ   r   rU   r  r~  r  r  r  rR   r5  r   r  r   r+  r,  r   rl  r6   rS   all_hidden_statesall_self_attnsall_cross_attentionspresent_key_value_states	attn_mask	mask_namer8  decoder_layerdropout_probabilityr  r:  s                                  r1   r   Kosmos2TextTransformer.forward  s!   ( 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"#//K!r;r?;I&',,.s3KTUU DSC^!3A!6!<!<Q!?de "A%L)-&..'%5#9% / 
 ==K8N

 !,1G1S%12HJ]J]grsugv%w"--mt||VZVcVc-d&&4==##p "	 #7BD0d&7<Q<]rdh)22  %((IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#!m%55!}}&+jjn#&75D5P_S1VZN**t}} $ A A!**!")*&/&;IcN1E1Q(-W[%! !.!!#1*?+A7@7LYs^RV5I5U,S1[_#1&7'! ! *!,M(]@Q1WX-Y,[[(  =#3"55(4(]1-=,??(a #9f 6  -!118+4+%1
 	
r3   )	r   r   r  r  r  r(  r  r  r'  )NNNr   NNNNNNNNNNNNNNNN)ro   rp   rq   rr   rs   r   r   r  r   r)   r   rF   r  r   r   rt   r*   r   r   r	   r   r   r   rv   r   r   s   @r1   r  r    s   ,0 ,('4 15/315&'/3!  -! u||,	!
 !.! !$! u||,!F  -115/3=A8<9=,07;=A04/3$(,0/3&*!U
ELL)U
 !.U
 u||,	U

 %-U\\$:U
  (5U
 !) 6U
 ELL)U
 'u||4U
 "$u'8'8"9:U
  -U
 u||,U
 D>U
 $D>U
 'tnU
  d^!U
" -.#U
$ 
u??	@%U
 U
r3   r  c                   6    \ rS rSr\rSrSS/rSrSr	Sr
S rSrg)Kosmos2PreTrainedModeli  Tr  r  c                 @   [        U [        5      (       a  U R                  R                  nO;[        U [        [
        45      (       a   U R                  R                  R                  n[        U [        [        45      (       a  U R                  R                  nO;[        U [        [
        45      (       a   U R                  R                  R                  n[        U[        5      (       a  [        R                  R                  UR                  SUR                   S-  W-  S9  [        R                  R                  UR"                  R$                  UR                  R&                  U-  S9  [        R                  R                  UR(                  R$                  UR                  R&                  U-  S9  g[        U[*        5      (       Ga  UR                   S-  SUR                  R,                  -  S-  -  W-  nUR                   S-  U-  n[        R                  R                  UR.                  R$                  US9  [        R                  R                  UR0                  R$                  US9  [        R                  R                  UR2                  R$                  US9  [        R                  R                  UR4                  R$                  US9  UR.                  R6                  b.  UR.                  R6                  R8                  R;                  5         UR0                  R6                  b.  UR0                  R6                  R8                  R;                  5         UR2                  R6                  b.  UR2                  R6                  R8                  R;                  5         UR4                  R6                  b/  UR4                  R6                  R8                  R;                  5         gg[        U[<        5      (       GaH  UR                  R>                  S-  SUR                  R,                  -  S-  -  W-  nSUR                  R>                  -  S-  U-  n[        R                  R                  UR@                  R$                  US9  [        R                  R                  URB                  R$                  US9  UR@                  R6                  b.  UR@                  R6                  R8                  R;                  5         URB                  R6                  b/  URB                  R6                  R8                  R;                  5         gg[        U[D        5      (       a  URF                  R6                  R8                  R;                  5         URF                  R$                  R8                  RI                  S5        URJ                  R6                  R8                  R;                  5         URJ                  R$                  R8                  RI                  S5        g[        U[L        5      (       a  URN                  R6                  R8                  R;                  5         URN                  R$                  R8                  RI                  S5        URP                  R6                  R8                  R;                  5         URP                  R$                  R8                  RI                  S5        g[        U[R        5      (       Ga  [        R                  R                  UR.                  R$                  WS9  [        R                  R                  UR0                  R$                  US9  [        R                  R                  UR2                  R$                  US9 
 [        R                  R                  UR4                  R$                  US9  UR.                  R6                  b.  UR.                  R6                  R8                  R;                  5         UR0                  R6                  b.  UR0                  R6                  R8                  R;                  5         UR2                  R6                  b.  UR2                  R6                  R8                  R;                  5         UR4                  R6                  b/  UR4                  R6                  R8                  R;                  5         gg[        U[T        5      (       a  [        R                  R                  UR@                  R$                  WS9  [        R                  R                  URB                  R$                  US9  UR@                  R6                  b.  UR@                  R6                  R8                  R;                  5         URB                  R6                  b/  URB                  R6                  R8                  R;                  5         gg[        U[        5      (       ay  [        R                  R                  URV                  R$                  WS9  URV                  R6                  b/  URV                  R6                  R8                  R;                  5         gg[        U[X        5      (       ay  [        R                  R                  URZ                  R$                  WS9  URZ                  R6                  b/  URZ                  R6                  R8                  R;                  5         gg[        U[\        5      (       a  UR^                  R$                  R8                  R                  SWS9  UR^                  R`                  bF  UR^                  R$                  R8                  UR^                  R`                     R;                  5         ggg)zInitialize the weightsr   r   )meanstd)r  r   Nr$   )1
isinstanceKosmos2VisionModelr   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_stdtext_configr   r
   initnormal_r   r   r   r   initializer_ranger   r   r&  r   r   r   r   r   datazero_r  r   r
  r  r  r  fill_r  r>  r@  rB  rr  r  lm_headKosmos2ImageToTextProjectiondenser  r  rK   )rc   r   factorr  in_proj_stdout_proj_stdfc_stds          r1   _init_weights$Kosmos2PreTrainedModel._init_weights  s   d.//[[33F|-LMNN[[..AAFd-/EFGG++&&C|-LMNN++))22Cf566GGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 677!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE}}!!-""''--/}}!!-""''--/}}!!-""''--/##/$$))//1 0 011!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?zz*

$$**,zz*

$$**, + 9::##((..0%%**005##((..0%%**005 899$$))//1&&++11#6!!&&++113!!((--33C8 344GGOOFMM00cO:GGOOFMM00cO:GGOOFMM00cO:GGOOFOO22O<}}!!-""''--/}}!!-""''--/}}!!-""''--/##/$$))//1 0//GGOOFJJ--3O7GGOOFJJ--3O7zz*

$$**,zz*

$$**, + 677GGOOFNN11sO;~~"".##((..0 / <==GGOOFLL//SO9||  ,!!&&,,. - 677&&++33#3F""..:##**//0C0C0O0OPVVX ; 8r3   rn   N)ro   rp   rq   rr   r   config_classsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_2_supports_sdpar  rv   rn   r3   r1   r  r    s1     L&*#46HI"&!NQYr3   r  c                      ^  \ rS rSr\rSrS\4U 4S jjrS\R                  4S jr
\     SS\\R                     S\\   S\\   S	\S
\\   S\\\4   4S jj5       rSrU =r$ )r  i	  r   r   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r  )r   r   r>  model	post_initr   s     r1   r   Kosmos2VisionModel.__init__  s&     -f5
r3   rX   c                 B    U R                   R                  R                  $ r  )r  r   r   rk   s    r1   get_input_embeddings'Kosmos2VisionModel.get_input_embeddings  s    zz$$444r3   r   r+  r   r,  c                 (    U R                  UUUUUS9$ )N)r   r   r+  r   r,  r  )rc   r   r   r+  r   r,  s         r1   r   Kosmos2VisionModel.forward  s)     zz%/!5%=#  
 	
r3   r  rI  )ro   rp   rq   rr   r   r  main_input_namer   r
   Moduler  r   r   r)   rt   r*   r	   r   r   r   rv   r   r   s   @r1   r  r  	  s    &L$O2 5bii 5  59,0/3).&*
u001
 $D>
 'tn	

 #'
 d^
 
u00	1
 
r3   r  c            '       ,  ^  \ rS rSr\rS\4U 4S jjrS\R                  4S jr	S r
\\               SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\\R"                        S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\\4   4"S jj5       5       rSrU =r$ )r  i*  r   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r  )r   r   r  r  r  r   s     r1   r   Kosmos2TextModel.__init__-  s&     +F3
r3   rX   c                 .    U R                   R                  $ r  r  r  rk   s    r1   r  %Kosmos2TextModel.get_input_embeddings3      zz&&&r3   c                 $    XR                   l        g r  r	  rc   r   s     r1   set_input_embeddings%Kosmos2TextModel.set_input_embeddings6      "'

r3   rJ   r   rU   r  r~  r  r  r  rR   r5  r   r  r   r+  r,  r   c                 F    U R                   " SUUUUUUUUU	U
UUUUUS.UD6$ )a4  
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        """
        # `Kosmos2TextTransformer` performs the actual decoding (token/image-feature fusion,
        # positional embeddings and the decoder layers); this wrapper only forwards arguments
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring(
    custom_intro="""
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
    config_class = Kosmos2TextConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__(config)
        self.model = Kosmos2TextTransformer(config)
        self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
a  
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.FTr  r   )r{   r)  r  )rz   r{   rR   rS   rT   r  rn   )r   r1  r   warningr  r  loss_functionr  r   rR   rS   rT   r  )rc   rJ   r   rU   r  r~  r  r  r  rR   r5  r   r)  r  r   r+  r,  r   r  	lm_logitsrz   s                        r1   r   Kosmos2TextForCausalLM.forward  s    R &1%<k$++B]B]klI** 
)%'A"7#9!5+'%/!5
  !
$ LL,	%%sYvR]R]RhRhslrsD0#33!//))$55
 	
r3   c                 f  > [        UU R                  R                  SS9n	Ub  S nS nOoUbl  UR                  5       u  pUR                  5       S   n[        R
                  " U[        R                  " XU-
  4[        R                  UR                  S94SS9n[        TU ](  " U4UUUUUU	US.UD6nU$ )Nr   )rK   r6   r8   )r%   r!   r5   r   r:   )rR   r   rU   r  r  r   cache_position)rM   r   r  r%   r)   r@   rA   r*   r5   r   prepare_inputs_for_generation)rc   rJ   rU   r  rR   r   r  r0  model_kwargsr   r   rh  mask_lenmodel_inputsr   s                 r1   r1  4Kosmos2TextForCausalLM.prepare_inputs_for_generation  s     :00#$
 &L)-&'3"+.."2J1668<H)..KKjH2D%EUZZ`i`p`pq *& w<

+)%'A%)

 

 r3   c                 P   ^ SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nrn   c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)rf  r'   r5   )ra   
past_statebeam_idxs     r1   rd   8Kosmos2TextForCausalLM._reorder_cache.<locals>.<genexpr>  s1     ncmU_--aZ=N=N1OPPcms   7:)ri   )rR   r9  reordered_past
layer_pasts    `  r1   _reorder_cache%Kosmos2TextForCausalLM._reorder_cache  s:     )Jncmnn N * r3   )r  r  )NNNNNNNNNNNNNNNN)NNNNNN)"ro   rp   rq   rr   r   r  _tied_weights_keysr   r
   r  r  r  r"  r'  r   r   r   r)   r   r   rt   
LongTensorr*   r   r  r	   r   r   r   r1  ro  r=  rv   r   r   s   @r1   r  r  t  s#    %L*+0 'bii '(ryy &  -115/3=A8<9=,07;=A04/3-1$(,0/3&*#M
ELL)M
 !.M
 u||,	M

 %-U\\$:M
  (5M
 !) 6M
 ELL)M
 'u||4M
 "$u'8'8"9:M
  -M
 u||,M
 ))*M
 D>M
 $D>M
  'tn!M
" d^#M
$ *+%M
& 
u77	8'M
  M
d #'/b  r3   r  c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )r  i   zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)r   c           	        > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  5      U l        [        R                  " [        R                  " UR                  UR                  R                  5      5      U l        [        UR                  UR                  R                  UR                  R                  UR                  R                   SSS9U l        g )NF)r   rs  rt  )r   r   r
   r   r  r   r  r   r  r   r)   r   latent_query_numlatent_queryrr  r  r   x_attnr   s     r1   r   %Kosmos2ImageToTextProjection.__init__#  s    YYv33??ASASA]A]^
LLV5L5LfN`N`NjNj)kl)((..&&88%*
r3   c                     U R                  U5      nU R                  R                  S5      R                  UR	                  S5      SS5      n[
        R                  " X#/SS9nU R                  UUS S S S9u  p%nX%4$ )Nr   r8   r   r:   )rS   r~  r  r   r   )r  rD  r   r&   r%   r)   r@   rE  )rc   featuresrS   rD  key_value_statesr   r   s          r1   r   $Kosmos2ImageToTextProjection.forward1  s    

8, ((2215<<]=O=OPQ=RTVXZ[ 99m%BJ)-&"2" *5 *
&Q **r3   )r  rD  rE  )
ro   rp   rq   rr   rs   r   r   r   rv   r   r   s   @r1   r  r     s    w
} 
+ +r3   r  z}
@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    """
)
class Kosmos2Model(Kosmos2PreTrainedModel):
    config_class = Kosmos2Config
    main_input_name = "pixel_values"

    def __init__(self, config: Kosmos2Config):
        super().__init__(config)
        self.text_model = Kosmos2TextModel(config.text_config)
        self.vision_model = Kosmos2VisionModel(config.vision_config)
        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.model.embed_tokens

    def set_input_embeddings(self, value):
        self.text_model.model.embed_tokens = value

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        return_attentions: Optional[bool] = False,
        interpolate_pos_encoding: Optional[bool] = False,
    ):
        r"""
Encodes images into continuous embeddings that can be forwarded to the language model.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
    return_attentions (`bool`, *optional*, defaults to `False`):
        Whether to return `projection_attentions` or not.
    interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
        Whether to interpolate positional embeddings or not.
        """
        vision_model_output = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        # use the whole `last_hidden_state` (through `post_layernorm`) instead of only the pooled output
        image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
        # normalized features
        image_embeds = nn.functional.normalize(image_embeds, dim=-1)
        image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        if return_attentions:
            return image_embeds, projection_attentions
        return image_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, Kosmos2ModelOutput]:
        r"""
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Kosmos2Model

>>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
>>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

>>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> text = (
...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
...     "</object>"
... )

>>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

>>> last_hidden_state = model(
...     pixel_values=inputs["pixel_values"],
...     input_ids=inputs["input_ids"],
...     attention_mask=inputs["attention_mask"],
...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
... ).last_hidden_state
>>> list(last_hidden_state.shape)
[1, 91, 2048]
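>>> # --- illustrative additions (not part of the original example) -------------------
>>> # `image_embeds_position_mask` marks the slots that are overwritten by the projected
>>> # image features; it has the same length as `input_ids`
>>> list(inputs["image_embeds_position_mask"].shape)
[1, 91]
>>> int(inputs["image_embeds_position_mask"].sum())
64

>>> # the projected image features can also be computed on their own (shapes assume the
>>> # default latent_query_num=64 and text hidden size 2048 of kosmos-2-patch14-224)
>>> image_embeds = model.get_image_features(inputs["pixel_values"])
>>> list(image_embeds.shape)
[1, 64, 2048]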
```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")
            image_embeds, projection_attentions = self.get_image_features(
                pixel_values,
                return_attentions=True,
                interpolate_pos_encoding=interpolate_pos_encoding,
            )

        outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )

        return Kosmos2ModelOutput(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )


@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    """
)
class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
    config_class = Kosmos2Config
    main_input_name = "pixel_values"
    _tied_weights_keys = ["text_model.lm_head.weight"]

    def __init__(self, config: Kosmos2Config):
        super().__init__(config)
        self.text_model = Kosmos2TextForCausalLM(config.text_config)
        self.vision_model = Kosmos2VisionModel(config.vision_config)
        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.model.embed_tokens

    def set_input_embeddings(self, value):
        self.text_model.model.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.text_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_model.set_output_embeddings(new_embeddings)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, Kosmos2ForConditionalGenerationModelOutput]:
        r"""
image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
    1]`:

    - 1 for places where to put the image features,
    - 0 for places that are not for image features (i.e. for text tokens).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

>>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
>>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

>>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> prompt = "<grounding> An image of"

>>> inputs = processor(text=prompt, images=image, return_tensors="pt")

>>> generated_ids = model.generate(
...     pixel_values=inputs["pixel_values"],
...     input_ids=inputs["input_ids"],
...     attention_mask=inputs["attention_mask"],
...     image_embeds=None,
...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
...     use_cache=True,
...     max_new_tokens=64,
... )
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
>>> processed_text
'<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

>>> caption, entities = processor.post_process_generation(generated_text)
>>> caption
'An image of a snowman warming himself by a fire.'

>>> entities
[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
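>>> # --- illustrative addition (not part of the original example) --------------------
>>> # `forward` also returns the projected image features; they can be cached and fed back
>>> # through `image_embeds` on later calls so the vision encoder is only run once
>>> import torch
>>> with torch.no_grad():
...     outputs = model(
...         pixel_values=inputs["pixel_values"],
...         input_ids=inputs["input_ids"],
...         attention_mask=inputs["attention_mask"],
...         image_embeds_position_mask=inputs["image_embeds_position_mask"],
...     )
>>> cached_image_embeds = outputs.image_embeds  # (batch, latent_query_num, hidden)
>>> generated_ids = model.generate(
...     input_ids=inputs["input_ids"],
...     attention_mask=inputs["attention_mask"],
...     image_embeds=cached_image_embeds,
...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
...     max_new_tokens=64,
... )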
```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")
            vision_model_output = self.vision_model(
                pixel_values=pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            # use the whole `last_hidden_state` (through `post_layernorm`) instead of only the pooled output
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        lm_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )

        return Kosmos2ForConditionalGenerationModelOutput(
            loss=lm_outputs.loss,
            logits=lm_outputs.logits,
            past_key_values=lm_outputs.past_key_values,
            hidden_states=lm_outputs.hidden_states,
            attentions=lm_outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )

    def generate(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        # in order to allow `inputs` argument (as in `GenerationMixin`)
        inputs = kwargs.pop("inputs", None)
        if pixel_values is not None and inputs is not None:
            raise ValueError(
                f"`inputs`: {inputs} were passed alongside `pixel_values` which is not allowed."
                f"Make sure to either pass `inputs` or pixel_values=..."
            )
        if pixel_values is None and inputs is not None:
            pixel_values = inputs

        if image_embeds is None:
            # run the vision encoder and projection once; generation itself only needs the
            # projected image features
            vision_model_output = self.vision_model(pixel_values)
            # use the whole `last_hidden_state` (through `post_layernorm`) instead of only the pooled output
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        output = self.text_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            **kwargs,
        )

        return output


__all__ = ["Kosmos2ForConditionalGeneration", "Kosmos2Model", "Kosmos2PreTrainedModel"]