# Recovered outline of the Janus modeling code in Transformers.
# Source file (only available here as CPython 3.10 bytecode):
#   /var/www/auris/lib/python3.10/site-packages/transformers/models/janus/modeling_janus.py
# The module structure, signatures and docstrings below were restored from the compiled
# file; method bodies that could not be recovered reliably are stubbed with `...`.

import copy
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import (
    ClassifierFreeGuidanceLogitsProcessor,
    GenerationMixin,
    GenerationMode,
    LogitsProcessorList,
)
from ...generation.utils import GenerateDecoderOnlyOutput
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, is_torch_available, logging, torch_int
from ..auto import AutoModel
from .configuration_janus import JanusConfig, JanusVisionConfig, JanusVQVAEConfig


if is_torch_available():
    import torch.nn.functional as F


logger = logging.get_logger(__name__)


@auto_docstring
class JanusPreTrainedModel(PreTrainedModel):
    config_class = JanusConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
    # Additional `_supports_*` feature flags (flash attention 2, SDPA, cache classes, static
    # cache, ...) are set in the original file; their exact values are not recoverable here.

    def _init_weights(self, module):
        std = (
            self.config.vision_config.initializer_range
            if hasattr(self.config, "vision_config")
            else self.config.initializer_range
        )
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.GroupNorm, nn.LayerNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


@dataclass
class JanusVQVAEOutput(ModelOutput):
    """
    Base class for Janus VQ-VAE mode model outputs.

    Args:
        decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            Reconstructed pixel values after encoding and decoding the input.
        embedding_loss (`torch.FloatTensor`):
            Embedding loss.
    """

    decoded_pixel_values: Optional[torch.FloatTensor] = None
    embedding_loss: torch.FloatTensor = None


@dataclass
class JanusBaseModelOutputWithPast(ModelOutput):
    """
    Base class for Janus model's outputs that may also contain past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model. If `past_key_values` is used,
            only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed):
            Pre-computed key and value hidden-states of the self-attention blocks that can be used to speed up
            sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed):
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Image hidden-states of the model produced by the vision encoder.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class JanusCausalLMOutputWithPast(ModelOutput):
    """
    Base class for Janus causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed):
            Pre-computed key and value hidden-states that can be used to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed):
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
            Attention weights after the attention softmax.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Image hidden-states of the model produced by the vision encoder.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


class JanusVisionEmbeddings(nn.Module):
    """Patch and position embeddings for the Janus vision encoder."""

    def __init__(self, config: JanusVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        ...

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        ...


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from
    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    """Default (non-fused) scaled dot-product attention, used when no optimized kernel is selected."""
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class JanusVisionAttention(nn.Module):
    """Attention Class for Janus Vision Encoder"""

    def __init__(self, config: JanusVisionConfig):
        # q/k/v and output projections, optional q/k layer norms, attention and projection dropout.
        ...

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ):
        ...


class JanusVisionMLP(nn.Module):
    """Feed-forward block (fc1 -> activation -> fc2) with dropout after each linear layer."""

    def __init__(self, config: JanusVisionConfig):
        ...

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        ...


class JanusVisionEncoderLayer(GradientCheckpointingLayer):
    """Pre-norm transformer block: self-attention and MLP, each wrapped in a residual connection."""

    def __init__(self, config: JanusVisionConfig):
        ...

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        ...


class JanusVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`JanusVisionEncoderLayer`].

    Args:
        config: JanusVisionConfig
    """

    def __init__(self, config: JanusVisionConfig):
        ...

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        ...


class JanusVisionModel(JanusPreTrainedModel):
    """Vision backbone: embeddings, encoder and a final layer norm with a pooled output."""

    main_input_name = "pixel_values"
    config_class = JanusVisionConfig

    def __init__(self, config: JanusVisionConfig):
        ...

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        ...

    def get_input_embeddings(self):
        ...


class JanusVisionAlignerMLP(nn.Module):
    """MLP that projects vision-encoder features into the language model's embedding space."""

    def __init__(self, config: JanusVisionConfig):
        ...

    def forward(self, hidden_states):
        ...


class JanusVQVAEVectorQuantizer(nn.Module):
    """
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to the one described in the VQ-VAE (Vector Quantized
    Variational AutoEncoder) paper. It quantizes continuous input vectors into discrete codebook vectors, which are
    learned during training. The current implementation improves over previous ones by avoiding costly matrix
    multiplications and allowing for post-hoc remapping of indices.
    """

    def __init__(self, config: JanusVQVAEConfig):
        ...

    def forward(self, hidden_state: torch.Tensor):
        # Nearest-codebook lookup with a straight-through estimator; returns the quantized
        # hidden state, the embedding loss and the selected codebook indices.
        ...

    def get_codebook_entry(self, image_tokens: torch.LongTensor) -> torch.FloatTensor:
        ...


class JanusVQVAEResnetBlock(nn.Module):
    """Residual block (GroupNorm -> x*sigmoid(x) -> Conv) with an optional convolutional shortcut."""
    ...


class JanusVQVAEAttnBlock(nn.Module):
    """Single-head spatial self-attention block used inside the VQ-VAE encoder/decoder."""
    ...


class JanusVQVAEConvDownsample(nn.Module):
    """Strided convolution used to halve the spatial resolution."""
    ...


class JanusVQVAEConvUpsample(nn.Module):
    """Nearest-neighbor upsampling followed by a convolution."""
    ...


class JanusVQVAEMidBlock(nn.Module):
    """Resnet block -> attention block -> resnet block, used at the VQ-VAE bottleneck."""
    ...


class JanusVQVAEEncoder(nn.Module):
    """Convolutional encoder mapping pixel values to latent features before quantization."""
    ...


class JanusVQVAEDecoder(nn.Module):
    """Convolutional decoder mapping quantized latents back to pixel values."""
    ...


@auto_docstring(
    custom_intro="""
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv Taigman](https://arxiv.org/abs/2203.13131).
    """
)
class JanusVQVAE(JanusPreTrainedModel):
    config_class = JanusVQVAEConfig
    main_input_name = "pixel_values"

    def __init__(self, config: JanusVQVAEConfig):
        # encoder, quantizer, quant/post-quant 1x1 convolutions and decoder.
        ...

    def encode(self, pixel_values: torch.FloatTensor):
        ...

    def decode(self, image_tokens: torch.LongTensor) -> torch.FloatTensor:
        """
        Decodes quantized token IDs into pixel values.

        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.

        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        """
        ...

    @can_return_tuple
    @auto_docstring
    def forward(self, pixel_values: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        # Encodes, quantizes and decodes `pixel_values`; returns a `JanusVQVAEOutput` with the
        # reconstructed pixels and the embedding loss.
        ...


class JanusVQVAEAlignerMLP(nn.Module):
    """MLP that maps VQ-VAE codebook embeddings into the language model's embedding space."""

    def __init__(self, config: JanusVQVAEConfig):
        ...

    def forward(self, hidden_states):
        ...


class JanusVQVAEHead(nn.Module):
    """Head used for sampling tokens in image generation, replacing the usual lm head."""

    def __init__(self, config: JanusVQVAEConfig):
        ...

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        ...


@auto_docstring(
    custom_intro="""
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    """
)
class JanusModel(JanusPreTrainedModel):
    def __init__(self, config: JanusConfig):
        super().__init__(config)
        self.config = config
        # Image understanding path: vision backbone plus aligner into the LM embedding space.
        self.vision_model = JanusVisionModel._from_config(config.vision_config)
        self.aligner = JanusVisionAlignerMLP(self.vision_model.config)
        # Image generation path: VQ-VAE plus generation embeddings, aligner and sampling head.
        self.vqmodel = JanusVQVAE._from_config(config.vq_config)
        self.generation_embeddings = nn.Embedding(self.vqmodel.config.num_embeddings, self.vqmodel.config.embed_dim)
        self.generation_aligner = JanusVQVAEAlignerMLP(self.vqmodel.config)
        self.generation_head = JanusVQVAEHead(self.vqmodel.config)
        # Llama language model backbone.
        self.language_model = AutoModel.from_config(config.text_config)
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_image_features(self, pixel_values):
        image_embeds = self.vision_model(pixel_values)
        image_embeds = self.aligner(image_embeds.last_hidden_state)
        return image_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> JanusBaseModelOutputWithPast:
        # Embeds text tokens, scatters aligned vision features into the positions of the image
        # placeholder tokens, and runs the language model. Returns a `JanusBaseModelOutputWithPast`.
        ...


class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"]
    _supports_static_cache = True

    def __init__(self, config: JanusConfig):
        super().__init__(config)
        self.config = config
        self.model = JanusModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.language_model.set_input_embeddings(value)

    def prepare_embeddings_for_image_generation(self, inputs: torch.Tensor) -> torch.Tensor:
        hidden_state = self.model.generation_embeddings(inputs)
        hidden_state = self.model.generation_aligner(hidden_state)
        return hidden_state

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> JanusCausalLMOutputWithPast:
        """
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        # Runs the backbone, projects the (sliced) last hidden states through `lm_head`, and
        # optionally computes the next-token prediction loss.
        ...

    def prepare_inputs_for_generation(
        self,
        input_ids,
        pixel_values=None,
        past_key_values=None,
        inputs_embeds=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Delegates to `GenerationMixin.prepare_inputs_for_generation` and forwards `pixel_values`
        # only on the first (prefill) step.
        ...

    def decode_image_tokens(self, image_tokens: torch.Tensor):
        """
        Decodes generated image tokens from language model to continuous pixel values with the VQGAN module via
        upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        """
        decoded_image = self.model.vqmodel.decode(image_tokens)
        decoded_image = decoded_image.permute(0, 2, 3, 1)
        return decoded_image

    @torch.no_grad()
    def generate(
        self,
        inputs: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        **kwargs,
    ):
        # `generation_mode="text"` delegates to `GenerationMixin.generate`. In image-generation mode,
        # the prompt is duplicated into a conditional/unconditional batch for classifier-free guidance,
        # image tokens are sampled (or argmax-decoded) autoregressively through `generation_head`
        # for `num_image_tokens` steps, and the sampled token ids (optionally with scores, logits,
        # attentions and hidden states in a `GenerateDecoderOnlyOutput`) are returned.
        ...


__all__ = ["JanusPreTrainedModel", "JanusForConditionalGeneration", "JanusModel", "JanusVQVAE", "JanusVisionModel"]
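# Illustration (not part of the original module): a minimal, standalone sketch of the
# nearest-codebook lookup with a straight-through estimator that a VQ-VAE quantizer of this
# kind performs. The helper name and the `beta` default are hypothetical; it only assumes a
# codebook of shape (num_embeddings, embedding_dim) and features flattened to (N, embedding_dim).
import torch


def _nearest_codebook_lookup(features: torch.Tensor, codebook: torch.Tensor, beta: float = 0.25):
    """features: (N, D) continuous vectors; codebook: (K, D) learned embeddings."""
    # Squared L2 distance to every codebook vector: ||x||^2 + ||e||^2 - 2 * x.e
    distances = (
        features.pow(2).sum(dim=1, keepdim=True)
        + codebook.pow(2).sum(dim=1)
        - 2 * features @ codebook.t()
    )
    indices = distances.argmin(dim=1)      # (N,) discrete token ids
    quantized = codebook[indices]          # (N, D) nearest codebook vectors
    # Codebook loss pulls the embeddings toward the encoder output, the beta-weighted
    # commitment loss pulls the encoder output toward its chosen embedding (VQ-VAE objective).
    codebook_loss = (quantized - features.detach()).pow(2).mean()
    commitment_loss = (quantized.detach() - features).pow(2).mean()
    loss = codebook_loss + beta * commitment_loss
    # Straight-through estimator: forward pass uses the quantized values, gradients flow to `features`.
    quantized = features + (quantized - features).detach()
    return quantized, indices, loss


# e.g.: quantized, token_ids, vq_loss = _nearest_codebook_lookup(torch.randn(16, 8), torch.randn(512, 8))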
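# Usage sketch (not part of the original module): generating an image and decoding the sampled
# VQ token ids back to pixels with `decode_image_tokens`. The checkpoint id, the `JanusProcessor`
# calls and the `generation_mode="image"` flag are assumptions based on the released Janus
# checkpoints and their documented usage; they may differ in your environment.
if __name__ == "__main__":
    import torch
    from transformers import JanusForConditionalGeneration, JanusProcessor

    repo_id = "deepseek-community/Janus-Pro-1B"  # example checkpoint id, adjust as needed
    processor = JanusProcessor.from_pretrained(repo_id)
    model = JanusForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map="auto")

    messages = [{"role": "user", "content": [{"type": "text", "text": "A dog running under the rain."}]}]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, generation_mode="image", return_tensors="pt").to(
        model.device, dtype=torch.bfloat16
    )

    # `generation_mode="image"` switches `generate()` from text decoding to autoregressive
    # VQ-token sampling with classifier-free guidance.
    image_tokens = model.generate(**inputs, generation_mode="image", do_sample=True, use_cache=True)

    # Map token ids back to pixel values via the VQ-VAE decoder, then post-process to PIL images.
    pixels = model.decode_image_tokens(image_tokens)
    images = processor.postprocess(list(pixels.float()), return_tensors="PIL.Image.Image")
    for i, image in enumerate(images["pixel_values"]):
        image.save(f"janus_sample_{i}.png")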