o
    Zh                    @   s  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@mAZAmBZBmCZCmDZDmEZE ddlFmGZGmHZH ddlImJZJ ddlKmLZL ddlMmNZNmOZOmPZP e6 rd dlZd dlQmZ d dlRm  mSZT d dlUZe7 rd dlVZVddlWmXZX ddl9mYZYmZZZ e8[e\Z]G dd deLZ^G d d! d!e>Z_G d"d# d#eXZ`e4G d$d% d%e0ZaeG d&d' d'e-ZbeG d(d) d)eGZceG d*d+ d+eHZdG d,d- d-ePZeG d.d/ d/ejfZgG d0d1 d1ejfZhG d2d3 d3eOZiG d4d5 d5eNZjG d6d7 d7e<ZkG d8d9 d9ejfZlG d:d; d;eEZmG d<d= d=eDZnG d>d? d?eBZoG d@dA dAeCZpG dBdC dCejfZqG dDdE dEejfZrG dFdG dGeAejfZsG dHdI dIejfZtG dJdK dKe@ZuG dLdM dMejfZvG dNdO dOejfZwe4dPdQG dRdS dSeaZxG dTdU dUeaeZyG dVdW dWeZzg dXZ{dS )Y    N)	dataclass)CallableDictIterableListOptionalTupleUnion)nn)BlipImageProcessor   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BatchFeatureget_size_dict)resizeto_channel_dimension_format)ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatmake_list_of_imagesto_numpy_array)FlashAttentionKwargs)ModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_availableis_vision_availablelogging   )	AutoModel)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddings)PretrainedConfig)CONFIG_MAPPING
AutoConfigc                       sN   e Zd ZdZdZdZ									
												d fdd	Z  ZS )JanusVisionConfiga
  
    This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
    `JanusVisionModel` according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 384):
            The size (resolution) of each image.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for attention weights.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, and `"gelu_new"` are supported.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of MLP hidden dimensionality to embedding dimensionality.
        attention_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys, and values in the attention layers.
        hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for fully connected layers in the encoder.
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the MLP projection head.
        projection_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for the projection layer.
        use_qk_norm (`bool`, *optional*, defaults to `False`):
            Whether to normalize the query and key matrices.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal initializer for initializing all weight matrices.
        depth (`int`, *optional*, defaults to 2):
            Number of hidden layers in the aligner module.
        num_image_tokens (`int`, *optional*, defaults to 576):
            Number of image tokens.
    Zjanus_vision_modelvision_config         r             ư>gelu      @T   F{Gz?r)   @  c                    sd   t  jd|||||||||	d	| | `|
| _|| _|| _|| _|| _|| _|| _	|| _
|| _d S )N)	hidden_sizenum_hidden_layersnum_attention_headsnum_channels
patch_size
image_sizeattention_dropoutlayer_norm_eps
hidden_act )super__init__intermediate_size	mlp_ratioattention_biashidden_dropout_rateprojection_dimprojection_dropoutuse_qk_norminitializer_rangedepthnum_image_tokens)selfrJ   rK   rL   rM   rN   rO   rP   rQ   rR   rW   rX   rY   rZ   r[   r\   r]   r^   r_   kwargs	__class__rS   V/var/www/auris/lib/python3.10/site-packages/transformers/models/janus/modular_janus.pyrU      s.   

zJanusVisionConfig.__init__)r?   r@   rA   r   rA   rB   rC   rD   rE   rF   TrC   rG   rC   FrH   r)   rI   )__name__
__module____qualname____doc__
model_typeZbase_config_keyrU   __classcell__rS   rS   rb   rd   r=   Q   s.    .r=   c                       sx   e Zd ZdZddddddddg d	d
dddd
ddfdededededededededee dedef fddZ  Z	S )JanusVQVAEConfiga:
  
    This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
    `JanusVQVAEModel` according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a
    configuration with the defaults will yield a similar configuration to the VQModel of the
    [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).

    Args:
        embed_dim (`int`, *optional*, defaults to 8):
            Dimensionality of each embedding vector.
        num_embeddings (`int`, *optional*, defaults to 16384):
            Number of codebook embeddings.
        double_latent (`bool`, *optional*, defaults to `False`):
            Whether to use double z channels.
        latent_channels (`int`, *optional*, defaults to 256):
            Number of channels for the latent space.
        num_patches (`int`, *optional*, defaults to 32):
            Num of patches the input images can be divided into.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            Number of out channels.
        base_channels (`int`, *optional*, defaults to 128):
            Base channel count.
        channel_multiplier (`List[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
            Channel multipliers for each resolution.
        num_res_blocks (`int`, *optional*, defaults to 2):
            Number of residual blocks.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the MLP projection head.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in VAVAE MLP Connecter module.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        image_token_embed_dim (`int`, *optional*, defaults to 2048):
            Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
       i @  F       r      )   rp   r)   r)      r)   rC   rH   rG   rE   	embed_dimnum_embeddingsdouble_latentlatent_channelsnum_patchesin_channelsout_channelsbase_channelschannel_multipliernum_res_blocksdropoutc                    s\   t  jd|||||||	|
||d
| || _|| _|| _|| _|| _|| _| `| `	| `
d S )N)
rr   rs   rt   ru   rw   ry   rz   r{   r|   r]   rS   )rT   rU   rv   rx   rZ   rK   rR   image_token_embed_dim
resolutionZattn_resolutionsZ	attn_type)r`   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r]   rZ   rK   rR   r}   ra   rb   rS   rd   rU      s.   zJanusVQVAEConfig.__init__)
re   rf   rg   rh   intboolr   floatrU   rj   rS   rS   rb   rd   rk      sR    .	
rk   c                       s:   e Zd ZdZdZeeedZ				d fdd	Z	  Z
S )	JanusConfiga;  
    This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
    Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.

    e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
    [deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
            The config object or dictionary of the vision backbone.
        vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
            The config object or dictionary of the VQVAE backbone.
        image_token_id (`int`, *optional*, defaults to 100581):
            Token index of a placeholder image token.

    Example:

    ```python
    >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

    >>> # Initializing a Janus vision config
    >>> vision_config = JanusVisionConfig()

    >>> # Initializing a Llama config
    >>> text_config = LlamaConfig()

    >>> # Initializing a VQ config
    >>> vq_config = JanusVQVAEConfig()

    >>> # Initializing a Janus Pro 1B style configuration
    >>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

    >>> # Initializing a model from the Janus Pro 1B style configuration
    >>> model = JanusForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Zjanus)text_configr>   	vq_configN c                    s`  t |tr|dd|d< t|d  d	i || _n"|d u r*td td  | _nt |tr3|| _n	tdt	| |d u rJtd t
 | _n t |trXt
d	i || _nt |t
ra|| _n	tdt	| |d u rxtd t | _n t |trtd	i || _nt |tr|| _n	tdt	| | jj| jj | j_|| _t jd	i | d S )
Nri   llamaz7`text_config` is None. Initializing with default valueszTInvalid type for `text_config`. Must be either `dict` or `LlamaConfig`. Type found: zK`vision_config` is None. Initializing with default JanusVisionConfig valuesz\Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`. Type found: zF`vq_config` is None. Initializing with default JanusVQVAEConfig valueszWInvalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`. Type found: rS   )
isinstancedictgetr;   r   loggerinfor:   
ValueErrortyper=   r>   rk   r   rO   rN   rv   image_token_idrT   rU   )r`   r   r>   r   r   ra   rb   rS   rd   rU   A  sP   










zJanusConfig.__init__)NNNr   )re   rf   rg   rh   ri   r<   r=   rk   Zsub_configsrU   rj   rS   rS   rb   rd   r     s    -r   c                   @   sF   e Zd ZeZdZdZdgZddgZdZ	dZ
dZdZdZdZdd Zd	S )
JanusPreTrainedModelmodelTZLlamaDecoderLayerpast_key_valuesZcausal_maskFc                 C   s   t | jdr| jjjn| jj}t|tjtjfr0|jj	j
d|d |jd ur.|jj	  d S d S t|tjtjfrH|jj	  |jj	d d S t|tjrg|jj	j
d|d |jd uri|jj	|j   d S d S d S )Nr>   rC   )meanstd      ?)hasattrconfigr>   r]   r   r
   LinearConv2dweightdataZnormal_biasZzero_	GroupNorm	LayerNormZfill_	EmbeddingZpadding_idx)r`   moduler   rS   rS   rd   _init_weights  s$   


z"JanusPreTrainedModel._init_weightsN)re   rf   rg   r   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointing_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attn_2Z_supports_sdpaZ_supports_quantized_cacheZ_supports_cache_class_supports_static_cacheZ!_supports_param_buffer_assignmentr   rS   rS   rS   rd   r   y  s    r   c                   @   s2   e Zd ZU dZdZeej ed< dZ	ejed< dS )JanusVQVAEOutputaM  
    Base class for Janus VQ-VAE mode model outputs.
    Args:
        decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            Reconstructed pixel values after encoding and decoding the input.
        embedding_loss (`torch.FloatTensor`):
            Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)
re   rf   rg   rh   r   r   torchFloatTensor__annotations__r   rS   rS   rS   rd   r     s   
 	r   c                   @      e Zd ZdS )JanusBaseModelOutputWithPastNre   rf   rg   rS   rS   rS   rd   r         r   c                   @   r   )JanusCausalLMOutputWithPastNr   rS   rS   rS   rd   r     r   r   c                   @   s(   e Zd ZddejdedejfddZdS )	JanusVisionEmbeddingsFpixel_valuesinterpolate_pos_encodingreturnc           
      C   sh   |j \}}}}| jjj}| |j|d}|ddd}|r(| |||}	n| | j	}	||	 }|S )Ndtyper)   rp   )
shapeZpatch_embeddingr   r   toflatten	transposer   Zposition_embeddingposition_ids)
r`   r   r   _heightwidthZtarget_dtypeZpatch_embedsZ
embeddingsZ
pos_embedsrS   rS   rd   forward  s   
zJanusVisionEmbeddings.forwardN)F)re   rf   rg   r   Tensorr   r   rS   rS   rS   rd   r     s     r   c                
       sX   e Zd ZdZdef fddZ		ddejdeej deej d	e	e
 fd
dZ  ZS )JanusVisionAttentionz(Attention Class for Janus Vision Encoderr   c                    sL  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _
|j}|j}d| _d| _tj| j| j| j |jd| _tj| j| j| j |jd| _tj| j| j| j |jd| _t| j| j| _|dkrt|nt | _|rt| jnt | _|rt| j| _d S t | _d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frp   r   r   )rT   rU   r   rJ   rr   rL   	num_headshead_dimr   scalerP   r[   r\   	is_causalZnum_key_value_groupsr
   r   rX   q_projk_projv_projprojection_layerDropoutZIdentityr   q_normk_norm)r`   r   Zproj_dropoutZqk_normrb   rS   rd   rU     s0   

$zJanusVisionAttention.__init__Nhidden_statesattention_maskoutput_attentionsra   c                 K   sl  |  \}}}| |}| |}	| |}
|d| j| j}| |}|	d| j| j}	| |	}	|||| j| j	dd}|	||| j| j	dd}	|

||| j| j	dd}
t}| jjdkr|| jjdkrv|ddrvtd nt| jj }|| ||	|
|f| jsd	n| j| j| jd
|\}}|||| j}| |}| |}|r||f}|S |d f}|S )Nrp   r)   eagerZsdpar   Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rC   )r|   Zscalingr   )sizer   r   r   reshaper   r   r   r   r   viewr5   r   Z_attn_implementationr   r   warning_oncer!   trainingrP   r   r   rr   r   r[   )r`   r   r   r   ra   
batch_sizeseq_lenr   Zquery_statesZ
key_statesZvalue_statesZattention_interfaceZattn_outputZattn_weightsoutputoutputsrS   rS   rd   r     sL   




	


zJanusVisionAttention.forward)NN)re   rf   rg   rh   r=   rU   r   r   r   r#   r   r   rj   rS   rS   rb   rd   r     s     r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )JanusVisionMLPr   c                    sr   t    || _t|j|j | _t|j | _	t
|j| j| _t
| j|j| _t
|j| _t
|j| _d S N)rT   rU   r   r   rJ   rW   rV   r   rR   activation_fnr
   r   fc1fc2r   rY   dropout1dropout2r`   r   rb   rS   rd   rU     s   
zJanusVisionMLP.__init__r   r   c                 C   s6   |  |}| |}| |}| |}| |}|S r   )r   r   r   r   r   r`   r   rS   rS   rd   r   $  s   




zJanusVisionMLP.forward)	re   rf   rg   r=   rU   r   r   r   rj   rS   rS   rb   rd   r     s    
r   c                       "   e Zd Zdef fddZ  ZS )JanusVisionEncoderLayerr   c                    sX   t    || _|j| _t|| _tj| j|j	d| _
tj| j|j	d| _t|| _d S )N)eps)rT   rU   r   rJ   rr   r   Z	self_attnr
   r   rQ   Zlayer_norm1Zlayer_norm2r   Zmlpr   rb   rS   rd   rU   .  s   

z JanusVisionEncoderLayer.__init__re   rf   rg   r=   rU   rj   rS   rS   rb   rd   r   -      r   c                       r   )JanusVisionEncoderr   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  qS rS   )r   .0r   r   rS   rd   
<listcomp>;  s    z/JanusVisionEncoder.__init__.<locals>.<listcomp>)rT   rU   r
   
ModuleListrangerK   Zlayersr   rb   r   rd   rU   9  s   $zJanusVisionEncoder.__init__r   rS   rS   rb   rd   r   8  r   r   c                       r   )JanusVisionModelr   c                    s   t  | t|| _d S r   )rT   rU   r   encoderr   rb   rS   rd   rU   ?  s   zJanusVisionModel.__init__r   rS   rS   rb   rd   r   >  r   r   c                       *   e Zd Zdef fddZdd Z  ZS )JanusVisionAlignerMLPr   c                    N   t    t j j| _t fddtd j	D | _
t j | _d S )Nc                       g | ]
}t  j jqS rS   r
   r   rZ   r   r   rS   rd   r   J      z2JanusVisionAlignerMLP.__init__.<locals>.<listcomp>rp   )rT   rU   r
   r   rJ   rZ   r   r   r   r^   hidden_layersr   rR   r   r   rb   r   rd   rU   E     
zJanusVisionAlignerMLP.__init__c                 C   ,   |  |}| jD ]}| |}||}q|S r   r   r   r   r`   r   layerrS   rS   rd   r   N  
   



zJanusVisionAlignerMLP.forward)re   rf   rg   r=   rU   r   rj   rS   rS   rb   rd   r   D      	r   c                       s8   e Zd Zdef fddZdejdejfddZ  Z	S )JanusVQVAEVectorQuantizerr   c                    s   t  | |jgd | _d S )Nr)   )rT   rU   rv   quant_state_dimsr   rb   rS   rd   rU   W  s   z"JanusVQVAEVectorQuantizer.__init__image_tokensr   c                 C   sb   |j d }| jjj d }| |}tj|ddd}||g| j|R }|dddd }|S )Nr   r   r)   )pdimr   rp   )	r   Z	embeddingr   F	normalizer   r   permute
contiguous)r`   r   r   Zemb_dimZhidden_state_quantrS   rS   rd   get_codebook_entry[  s   

z,JanusVQVAEVectorQuantizer.get_codebook_entry)
re   rf   rg   rk   rU   r   
LongTensorr   r  rj   rS   rS   rb   rd   r   V  s    r   c                   @   r   )JanusVQVAEResnetBlockNr   rS   rS   rS   rd   r	  k      r	  c                   @   r   )JanusVQVAEAttnBlockNr   rS   rS   rS   rd   r  o  r
  r  c                   @   r   )JanusVQVAEConvDownsampleNr   rS   rS   rS   rd   r  s  r
  r  c                       s$   e Zd Z fddZdd Z  ZS )JanusVQVAEConvUpsamplec                    s&   t    tjj||dddd| _d S )Nr   rp   Zkernel_sizeZstridepadding)rT   rU   r   r
   r   conv)r`   rw   rb   rS   rd   rU   x  s   
zJanusVQVAEConvUpsample.__init__c                 C   s   t j|ddd}| |}|S )Ng       @Znearest)Zscale_factormode)r  Zinterpolater  r   rS   rS   rd   r   |  s   
zJanusVQVAEConvUpsample.forward)re   rf   rg   rU   r   rj   rS   rS   rb   rd   r  w  s    r  c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	JanusVQVAEMidBlockr   channelsc                    s8   t    t|||d| _t|| _t|||d| _d S )Nr   rw   rx   )rT   rU   r	  block_1r  attn_1block_2)r`   r   r  rb   rS   rd   rU     s   

zJanusVQVAEMidBlock.__init__r   r   c                 C   "   |  |}| |}| |}|S r   )r  r  r  r   rS   rS   rd   r        


zJanusVQVAEMidBlock.forward)
re   rf   rg   rk   r   rU   r   r   r   rj   rS   rS   rb   rd   r    s    r  c                   @   s$   e Zd Zdd ZdejfddZdS )JanusVQVAEEncoderc              	   C   sn  t j  t|j| _|j| _|j}|j}|j	}|j
}|j}tj j||dddd| _dt| }|| _t  | _t| jD ]T}t  }	t  }
|||  }|||  }t| jD ]}|	t|||d |}|| jd krt|
t| qXt  }|	|_|
|_|| jd krt||_| j| q=t||| _tj jd|ddd	| _tj j||rd
| n|dddd| _d S )Nr   rp   r  )rp   r  rn   rD   TZ
num_groupsrM   r   Zaffiner)   )r
   ModulerU   lenrz   num_resolutionsr{   ry   rw   rt   ru   r   r   conv_intuplein_channel_multiplierr   downr   appendr	  r  blockattnr  
downsampler  midr   norm_outconv_out)r`   r   ry   rw   rt   ru   rz   r!  i_levelr$  r%  block_in	block_outi_blockr"  rS   rS   rd   rU     sX   


zJanusVQVAEEncoder.__init__r   c                 C   s   |  |g}t| jD ]C}t| jD ]'}| j| j| |d }t| j| jdkr4| j| j| |}|| q|| jd krN|| j| 	|d  q|d }| 
|}| |}|t|9 }| |}|S )Nr   r   rp   )r  r   r  r{   r"  r$  r  r%  r#  r&  r'  r(  r   sigmoidr)  )r`   r   r   r*  r-  hidden_statelast_hidden_staterS   rS   rd   r     s$   


zJanusVQVAEEncoder.forwardN)re   rf   rg   rU   r   r  r   rS   rS   rS   rd   r    s    3r  c                       s2   e Zd Z fddZdejdejfddZ  ZS )JanusVQVAEDecoderc              	      sP  t    t|j| _|j| _|j}|j}|j}||j| jd   }t	j
j||dddd| _t||| _t
 | _tt| jD ]N}t
 }t
 }||j|  }	t| jd D ]}
|t|||	d |	}|| jd krt|t| qXt
 }||_||_|dkrt||_| j| q@t	j
jd|ddd	| _t	j
j||dddd| _d S )
Nrp   r   r  r  r   rn   rD   Tr  )rT   rU   r  rz   r  r{   ry   ru   rx   r   r
   r   r  r  r'  r   upreversedr   r#  r	  r  r  r$  r%  r  upsampler   r(  r)  )r`   r   ry   ru   rx   r+  r*  r$  r%  r,  r-  r2  rb   rS   rd   rU     sD   


zJanusVQVAEDecoder.__init__r/  r   c                 C   s   |  |}| |}t| jD ]9}t| jd D ] }| j| j| |}t| j| jdkr8| j| j| |}q|| jd krH| j| 	|}q| 
|}|t|9 }| |}|S )Nrp   r   )r  r'  r   r  r{   r2  r$  r  r%  r4  r(  r   r.  r)  )r`   r/  r*  r-  rS   rS   rd   r     s   



zJanusVQVAEDecoder.forward)re   rf   rg   rU   r   r   r   rj   rS   rS   rb   rd   r1    s    .r1  c                       sl   e Zd Zg dZdZdef fddZdejdej	fdd	Z
eedej	deej	ej	f fd
dZ  ZS )
JanusVQVAE)r  r	  r   r   r   c                    s(   t  | t|| _d| _|   d S )NF)rT   rU   r1  decodergradient_checkpointing	post_initr   rb   rS   rd   rU   1  s   
zJanusVQVAE.__init__r   r   c                 C   sr   |j d | jjd | jjd  kr'td| jjd | jjd   d|j  d| j|}| |}| |}|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        rp   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizer   r   r  Zpost_quant_convr6  )r`   r   Zcodebook_entryr   r   rS   rS   rd   decode9  s   "	

zJanusVQVAE.decodec                 C   s:   |j d }| |\}}}| ||d}t||}|S )Nr   r   )r   encoder:  r   r   )r`   r   r   Zquantr   indicesr   r   rS   rS   rd   r   L  s
   

zJanusVQVAE.forward)re   rf   rg   r   Zmain_input_namerk   rU   r   r  r   r:  r%   r$   r   r   rj   rS   rS   rb   rd   r5  )  s    r5  c                       r   )JanusVQVAEAlignerMLPr   c                    r   )Nc                    r   rS   r   r   r   rS   rd   r   `  r   z1JanusVQVAEAlignerMLP.__init__.<locals>.<listcomp>rp   )rT   rU   r
   r   rr   rZ   r   r   r   rK   r   r   rR   r   r   rb   r   rd   rU   [  r   zJanusVQVAEAlignerMLP.__init__c                 C   r   r   r   r   rS   rS   rd   r   d  r   zJanusVQVAEAlignerMLP.forward)re   rf   rg   rk   rU   r   rj   rS   rS   rb   rd   r=  Z  r   r=  c                       s<   e Zd ZdZdef fddZdejdejfddZ	  Z
S )	JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                    s>   t    t|j|j| _t|j | _	t|j|j
| _d S r   )rT   rU   r
   r   r}   rZ   proj_outr   rR   r   rs   vision_headr   rb   rS   rd   rU   o  s   
zJanusVQVAEHead.__init__r   r   c                 C   r  r   )r?  r   r@  r   rS   rS   rd   r   u  r  zJanusVQVAEHead.forward)re   rf   rg   rh   rk   rU   r   r   Ztensorr   rj   rS   rS   rb   rd   r>  l  s    r>  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    )Zcustom_introc                       s   e Zd Zdef fddZdd Zdd Zdd	 Zee		
	
	
	
	
	
	
	
	
	
	dde
jde
jdee
j dee
j dee dee
j dee
j dee dee dee deee
jf fddZ  ZS )
JanusModelr   c                    s   t  | || _t|j| _t| jj| _t	|j
| _t| jjj| jjj| _t| jj| _t| jj| _tj|jd| _d| _|   d S )Nr   F)rT   rU   r   r   Z_from_configr>   vision_modelr   alignerr5  r   vqmodelr
   r   rs   rr   generation_embeddingsr=  generation_alignerr>  generation_headr*   Zfrom_configr   language_modelr7  r8  r   rb   rS   rd   rU     s   zJanusModel.__init__c                 C   s
   | j  S r   )rH  get_input_embeddingsr`   rS   rS   rd   rI       
zJanusModel.get_input_embeddingsc                 C   s   | j | d S r   )rH  set_input_embeddingsr`   valuerS   rS   rd   rL    s   zJanusModel.set_input_embeddingsc                 C   s   |  |}| |j}|S r   )rB  rC  r0  )r`   r   image_embedsrS   rS   rd   get_image_features  s   
zJanusModel.get_image_featuresNr   	input_idsr   r   r   r   cache_positioninputs_embeds	use_cacher   output_hidden_stateslogits_to_keepc                 K   s<  |	d ur|	n| j j}	|
d ur|
n| j j}
|d u |d uA r td| jr/| jr/|r/td d}|d ur;|d ur;td|d u rE|  |}|d urw| 	|}|| j j
k}|jd }|d|}|ddd|}||j|j}|||}| jd||||||	|
||d	|}t|j|j|j|j|d ur|nd d}|S )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oner   )	rS  r   r   r   rT  r   rU  rR  rV  )r0  r   r   
attentionsimage_hidden_statesrS   )r   r   rU  r   r7  r   r   r   rI  rP  r   r   r   	unsqueezeexpandr   devicer   Zmasked_scatterrH  r   r0  r   r   rW  )r`   rQ  r   r   r   r   rR  rS  rT  r   rU  rV  ra   rO  Zimage_attention_maskrr   Zimage_featuresZ	lm_outputr   rS   rS   rd   r     s`   


zJanusModel.forward)NNNNNNNNNNr   )re   rf   rg   r   rU   rI  rL  rP  r%   r$   r   r  r   r   r   r   r   r	   r   r   rj   rS   rS   rb   rd   rA  |  sR    	
rA  c                       sj  e Zd ZddgZdZdef fddZdd Zd	d
 Zde	j
de	j
fddZdd Zdd Zdd Zdd Zee												d/de	jde	jdee	j
 dee	j dee dee	j dee	j d ee	j d!ee d"ee d#ee d$eee	j
f fd%d&Z						d0 fd'd(	Zd)e	j
fd*d+Ze	j			d1de	j
dee	j d,ee f fd-d.Z  ZS )2JanusForConditionalGenerationz(model.language_model.embed_tokens.weightzlm_head.weightTr   c                    sB   t  | || _t|| _tj|jj|jj	dd| _
|   d S )NFr   )rT   rU   r   rA  r   r
   r   r   rJ   
vocab_sizelm_headr8  r   rb   rS   rd   rU     s
   
z&JanusForConditionalGeneration.__init__c                 C   s   | j j S r   )r   rH  rI  rJ  rS   rS   rd   rI    s   z2JanusForConditionalGeneration.get_input_embeddingsc                 C   s   | j j| d S r   )r   rH  rL  rM  rS   rS   rd   rL    s   z2JanusForConditionalGeneration.set_input_embeddingsinputsr   c                 C   s   | j |}| j |}|S r   )r   rE  rF  )r`   r_  r/  rS   rS   rd   'prepare_embeddings_for_image_generation  s   zEJanusForConditionalGeneration.prepare_embeddings_for_image_generationc                 C      | j S r   r^  rJ  rS   rS   rd   get_output_embeddings     z3JanusForConditionalGeneration.get_output_embeddingsc                 C   
   || _ d S r   rb  )r`   Znew_embeddingsrS   rS   rd   set_output_embeddings
  rK  z3JanusForConditionalGeneration.set_output_embeddingsc                 C   re  r   r   )r`   r6  rS   rS   rd   set_decoder  rK  z)JanusForConditionalGeneration.set_decoderc                 C   ra  r   rg  rJ  rS   rS   rd   get_decoder  rd  z)JanusForConditionalGeneration.get_decoderNr   rQ  r   r   r   r   rR  rS  labelsrT  r   rU  rV  c                 K   s   |
dur|
n| j j}
|dur|n| j j}| jd|||||||	|
||d
|}|j}t|tr5t| dn|}| |dd|ddf }d}|durV| j	||| j j
jd}t|||j|j|j|jd}|S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)
rQ  r   r   r   r   rS  rT  r   rU  rR  )logitsrj  r]  )lossrk  r   r   rW  rX  rS   )r   r   rU  r   r0  r   r   slicer^  Zloss_functionr   r]  r   r   r   rW  rX  )r`   rQ  r   r   r   r   rR  rS  rj  rT  r   rU  rV  ra   r   r   Zslice_indicesrk  rl  r   rS   rS   rd   r     s@   z%JanusForConditionalGeneration.forwardc           
         s8   t  j|f|||||d|}	|d dkr||	d< |	S )N)r   rS  r   rR  rV  r   r   )rT   prepare_inputs_for_generation)
r`   rQ  r   r   r   rS  rR  rV  ra   model_inputsrb   rS   rd   rn  P  s   z;JanusForConditionalGeneration.prepare_inputs_for_generationr   c                 C   s"   | j j|}|dddd}|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r)   r   rp   )r   rD  r:  r  )r`   r   Zdecoded_imagerS   rS   rd   decode_image_tokensn  s   z1JanusForConditionalGeneration.decode_image_tokenslogits_processorc           %         sz  | d| j}t|}| dd}|dkr$t jd|||d d|S |jdi |}| tj	tj
fvr:td|  | |  |d urK|nt }d|d< |jd u r_td d	|_|j|d
< | ||j|\}}	}|j|j}
}t|jdkrtd|j d|d u}| j|||jd |jr|jdkr|t|j d |_| j||jd |d ||d}| jd|||jd|\}}| jjj j!}|j\}}|"dd}| dd }|"dd}||d< ||d d d f |jk||d d d f |j#d k@ }||d d d f $||j% | & |}| '|||}|(dd d u r=| j)|j*p,d|d t+|j,|| ||d|d< t-j.||f|
|d}|j/}|j0}|j1}|j2}|j3}|r^|r^dnd }|rh|rhdnd }|rr|rrdnd }|r||r|dnd }t4|D ]}| j5d||d|}|d 6|j|d< |d 6|j|d< | jj7di |||d}| 8||}|j9d d dd d f : } | j;| }!|||!}"|j<rt-j=|"dd}#t-j>|#dd?d}$nt-j@|"dd}$|$|d d |f< t-A|$|$g}$|$Bd}$| C|$}q|r-|r||!f7 }|r|| D f7 }|r%||jE7 }|r-||jF7 }|r;tG||!||||jHdS |S ) Ngeneration_configgeneration_modetext)r_  r   rr  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.TrT  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   ru  r)   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r[  rp   )rr  Zinput_ids_seq_lengthZencoder_input_idsZprefix_allowed_tokens_fnrq  r[  )rQ  r   Zexpand_sizer   Zboi_token_idr   Zstatic)cache_implementationr   Zmax_cache_lenr[  model_kwargs)r   r[  rS   )rS  rQ  rR  )r   rU  r   )r  )Znum_samples)	sequencesscoresrk  rW  r   r   )Ipoprr  copydeepcopyrT   generateupdateZget_generation_moder   ZSAMPLEZGREEDY_SEARCHr   validateZ_validate_model_kwargsr   ru  r   warningZ_prepare_model_inputsZbos_token_idr   r[  r  r   Z_prepare_special_tokensr#  r   Z_get_logits_processorZ_expand_inputs_for_generationZnum_return_sequencesr   rB  r   r_   repeatZgeneration_kwargsZmasked_fill_Zpad_token_idrI  Z_get_initial_cache_positionr   Z
_get_cacherw  max
max_lengthr   zerosr   rU  output_scoresoutput_logitsreturn_dict_in_generater   rn  r   rH  Z#_update_model_kwargs_for_generationr0  clonerG  Z	do_sampleZsoftmaxZmultinomialZsqueezeZargmaxcatrY  r`  r   rW  r   r   r   )%r`   r_  r   rq  ra   rr  rs  rx  rQ  Zmodel_input_namer   r[  Zkwargs_has_attention_maskr_   r   r   Zinput_tokensmaskrS  Zgenerated_tokensr   rU  r  r  r  Z
raw_scoresZ
raw_logitsZdecoder_hidden_statesZdecoder_attentionsiro  r   r/  rz  Znext_token_scoresZprobsZ
next_tokenrb   rS   rd   r~  z  s   	
















	z&JanusForConditionalGeneration.generate)NNNNNNNNNNNr   )NNNNNN)NNN) re   rf   rg   Z_tied_weights_keysr   r   rU   rI  rL  r   r   r`  rc  rf  rh  ri  r%   r$   r  r   r   r   r   r	   r   r   rn  rp  Zno_gradr   r~  rj   rS   rS   rb   rd   r\    s    		
>r\  c                       s  e Zd ZdZdddejddddddf
dedeee	e
f  de
d	ed
edee
ef dedeeeee f  deeeee f  dee f fddZ			d"dejdee
ee
e
e
f f deee	ef  deee	ef  dejf
ddZejddfdejdeee	e
f e
f d	edeee	ef  deee	ef  dejfddZ							d#ded
ee dee dee deee  deee  dee	 dee	 fddZ	d$dejdeeee f deeee f deee	ef  dejf
d d!Z  ZS )%JanusImageProcessora
  
    Constructs a JANUS image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        min_size (`int`, *optional*, defaults to 14):
            The minimum allowed size for the resized image. Ensures that neither the height nor width
            falls below this value after resizing.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    TN   gp?	do_resizer   min_sizeresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbc                    sB   t  jdi | || _|d u rd| _d S tdd |D | _d S )N)   r  r  c                 S   s   g | ]}t |d  qS )   )r   )r   xrS   rS   rd   r   u  s    z0JanusImageProcessor.__init__.<locals>.<listcomp>rS   )rT   rU   r  background_colorr   )r`   r  r   r  r  r  r  r  r  r  r  ra   rb   rS   rd   rU   a  s
   
zJanusImageProcessor.__init__r   imager  data_formatinput_data_formatr   c                 C   s  t ||\}}|tjkr|jd n|jd }||kr*|dur&t|||}|S |}|S t||}t|tr8|g}nt||krFt	d| d|tjkrt
j|||f|jd}	t|D ]\}
}||	|
ddddf< qZ||kr|| d }||	dd||| ddf< |	S || d }||	dddd||| f< |	S t
j|||f|jd}	t|D ]\}
}||	dddd|
f< q||kr|| d }||	||| ddddf< |	S || d }||	dd||| ddf< |	S )a}  
        Pads an image to a square based on the longest edge.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `Tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers representing for multi-channel images. If passed as integer
                in mutli-channel mode, it will default to `0` in subsequent channels.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The padded image.
        r   r   Nz(background_color must have no more than z) elements to match the number of channelsr   r)   )r   r   ZFIRSTr   r   r  r   r   r  r   npr  r   	enumerate)r`   r  r  r  r  r   r   rM   Zmax_dimresultr  colorstartrS   rS   rd   pad_to_squarew  sL   




z!JanusImageProcessor.pad_to_squarec                 K   s   |du rt |}t||\}}t||}	t|dd}|d |d kr0td|d  d|d  |d }||	 }
tt||
 | jtt||
 | jg}t|f||||d|}| j|| j	|d	}|S )
a  
        Resize an image to dynamically calculated size.

        Args:
            image (`np.ndarray`):
                Image to resize.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `None`: will be inferred from input
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        NT)Zdefault_to_squarer   r   z5Output height and width must be the same. Got height=z and width=)r   r  r  r  )r  r  r  )
r   r   r  r   r   r   r  r   r  r  )r`   r  r   r  r  r  ra   r   r   max_sizedeltaZoutput_size_nonpaddedrS   rS   rd   r     s<   !
	zJanusImageProcessor.resizeimagesreturn_tensorsc	                 C   sR  |dur|n| j }|du rd| j n|}|dur|n| j}|dur#|n| j}|dur,|n| j}t|}t|d tjjrHt	|dkrD|S |d S |du rRt
|d }g }	|D ]@}
t|
}
|rg| j|
|||d}
|r{| j|
||d}
|
ddtj}
|r|r|dkrt|
tj|d	}
tj|
}
|	|
 qVd
|	i}|dkr|nd}t||dS )znApplies post-processing to the decoded image tokens by reversing transformations applied during preprocessing.Nr   r   rp   )r  r  r  r  )r   r  r  zPIL.Image.Image)Zinput_channel_dimr   )r   Ztensor_type)r  r  r  r  r  r   r   PILZImager  r   r   unnormalizeZrescaleZclipZastyper  Zuint8r   r   ZLASTZ	fromarrayr#  r   )r`   r  r  r  r  r  r  r  r  r   r  r   rS   rS   rd   postprocess  s6   zJanusImageProcessor.postprocessc                 C   s   d}t |trt||krtd| dt| n|g| }t |tr7t||kr6td| dt| n|g| }tdd t||D }tdd |D }| j||||d}|S )	a~  
        Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
        image = (image * image_std) + image_mean
        Args:
            image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
                Batch of pixel values to postprocess.
            image_mean (`float` or `Iterable[float]`):
                The mean to use for unnormalization.
            image_std (`float` or `Iterable[float]`):
                The standard deviation to use for unnormalization.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        r   zmean must have z$ elements if it is an iterable, got zstd must have c                 s   s    | ]
\}}| | V  qd S r   rS   )r   r   r   rS   rS   rd   	<genexpr>_  s    z2JanusImageProcessor.unnormalize.<locals>.<genexpr>c                 s   s    | ]}d | V  qdS )rp   NrS   )r   r   rS   rS   rd   r  `  s    )r  r   r   r  )r   r   r  r   r   zipr  )r`   r  r  r  r  rM   Zrev_image_meanZrev_image_stdrS   rS   rd   r  9  s"   



zJanusImageProcessor.unnormalize)r   NN)NNNNNNNr   )re   rf   rg   rh   r   ZBICUBICr   r   r   strr   r	   r   r   rU   r  Zndarrayr   r   arrayr  r   r   r  r   r  rj   rS   rS   rb   rd   r  ;  s    '
	

N
H

	
8r  )	r  r   r\  rA  r5  r   rk   r=   r   )|r|  dataclassesr   typingr   r   r   r   r   r   r	   numpyr  r   r
   Z.transformers.models.blip.image_processing_blipr   Zactivationsr   Zcache_utilsr   Z
generationr   r   r   r   Zgeneration.utilsr   Zimage_processing_utilsr   r   Zimage_transformsr   r   Zimage_utilsr   r   r   r   r   r   r   Zmodeling_flash_attention_utilsr   Zmodeling_outputsr    Zmodeling_utilsr!   r"   Zprocessing_utilsr#   utilsr$   r%   r&   r'   r(   autor*   Zblip_2.modeling_blip_2r+   Z!chameleon.configuration_chameleonr,   Zchameleon.modeling_chameleonr-   r.   r/   r0   r1   r2   Zidefics.modeling_ideficsr3   r4   Zllama.modeling_llamar5   Zsiglip.configuration_siglipr6   Zsiglip.modeling_siglipr7   r8   r9   Ztorch.nnZtorch.nn.functionalZ
functionalr  Ztorch.utils.checkpointr  Zconfiguration_utilsr:   r;   r<   Z
get_loggerre   r   r=   rk   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r	  r  r  r  r  r  r1  r5  r=  r>  rA  r\  r  __all__rS   rS   rS   rd   <module>   s   $$	 
aZm UMD1n  N  .