
    fTh                       S SK r S SKJr  S SKJrJrJrJrJrJ	r	J
r
  S SKrS SKrS SKJr  S SKJr  SSKJr  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJrJr  SSKJ r J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/J0r0  SSK1J2r2  SSK3J4r4J5r5J6r6J7r7J8r8  SSK9J:r:  SSK;J<r<  SSK=J>r>  SSK?J@r@JArAJBrBJCrCJDrDJErE  SSKFJGrGJHrH  SSKIJJrJ  SSKKJLrL  SSKMJNrNJOrOJPrP  \6" 5       (       a  S SKrS SKQJr  S SKRJs  JSrT  S SKUr\7" 5       (       a  S SKVrVSSKWJXrX  SSK9JYrYJZrZ  \8R                  " \\5      r] " S S\L5      r^ " S  S!\>5      r_ " S" S#\X5      r`\4 " S$ S%\05      5       ra\ " S& S'\-5      5       rb\ " S( S)\G5      5       rc\ " S* S+\H5      5       rd " S, S-\P5      re " S. S/\R                  5      rg " S0 S1\R                  5      rh " S2 S3\O5      ri " S4 S5\N5      rj " S6 S7\<5      rk " S8 S9\R                  5      rl " S: S;\E5      rm " S< S=\D5      rn " S> S?\B5      ro " S@ SA\C5      rp " SB SC\R                  5      rq " SD SE\R                  5      rr " SF SG\A\R                  5      rs " SH SI\R                  5      rt " SJ SK\@5      ru " SL SM\R                  5      rv " SN SO\R                  5      rw\4" SPSQ9 " SR SS\a5      5       rx " ST SU\a\5      ry " SV SW\5      rz/ SXQr{g)Y    N)	dataclass)CallableDictIterableListOptionalTupleUnion)nn)BlipImageProcessor   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BatchFeatureget_size_dict)resizeto_channel_dimension_format)ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatmake_list_of_imagesto_numpy_array)FlashAttentionKwargs)ModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_availableis_vision_availablelogging   )	AutoModel)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddings)PretrainedConfig)CONFIG_MAPPING
AutoConfigc                   \   ^  \ rS rSrSrSrSr                  SU 4S jjrSrU =r	$ )JanusVisionConfigQ   a^
  
This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
`JanusVisionModel` according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch.
    image_size (`int`, *optional*, defaults to 384):
        The size (resolution) of each image.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for attention weights.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, and `"gelu_new"` are supported.
    mlp_ratio (`float`, *optional*, defaults to 4.0):
        Ratio of MLP hidden dimensionality to embedding dimensionality.
    attention_bias (`bool`, *optional*, defaults to `True`):
        Whether to add a bias to the queries, keys, and values in the attention layers.
    hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
        The dropout probability for fully connected layers in the encoder.
    projection_dim (`int`, *optional*, defaults to 2048):
        Dimensionality of the MLP projection head.
    projection_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for the projection layer.
    use_qk_norm (`bool`, *optional*, defaults to `False`):
        Whether to normalize the query and key matrices.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated normal initializer for initializing all weight matrices.
    depth (`int`, *optional*, defaults to 2):
        Number of hidden layers in the aligner module.
    num_image_tokens (`int`, *optional*, defaults to 576):
        Number of image tokens.
janus_vision_modelvision_configc                    > [         TU ]  " SUUUUUUUUU	S.	UD6  U ?Xl        Xl        Xl        Xl        Xl        Xl        UU l	        UU l
        UU l        g )N)	hidden_sizenum_hidden_layersnum_attention_headsnum_channels
patch_size
image_sizeattention_dropoutlayer_norm_eps
hidden_act )super__init__intermediate_size	mlp_ratioattention_biashidden_dropout_rateprojection_dimprojection_dropoutuse_qk_norminitializer_rangedepthnum_image_tokens)selfrD   rE   rF   rG   rH   rI   rJ   rK   rL   rQ   rR   rS   rT   rU   rV   rW   rX   rY   kwargs	__class__s                       _/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/janus/modular_janus.pyrO   JanusVisionConfig.__init__   s~    , 	 	
#/ 3%!!/)!	
 	
 "",#6 ,"4&!2
 0    )	rR   rX   rS   rW   rQ   rY   rT   rU   rV   )i         r   ra   i          ư>gelug      @Trb      rb   F{Gz?r*   i@  )
__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyrO   __static_attributes____classcell__r\   s   @r]   r?   r?   Q   sW    ,\ &J%O ',1 ,1r_   r?   c                      ^  \ rS rSrSrSSSSSSSS	/ S
QSSSSSSS4S\S\S\S\S\S\S\S\S\\   S\S\4U 4S jjjr	Sr
U =r$ )JanusVQVAEConfig   a	  
This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
`JanusVQVAEModel` according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a
configuration with the defaults will yield a similar configuration to the VQModel of the
[deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).

Args:
    embed_dim (`int`, *optional*, defaults to 8):
        Dimensionality of each embedding vector.
    num_embeddings (`int`, *optional*, defaults to 16384):
        Number of codebook embeddings.
    double_latent (`bool`, *optional*, defaults to `False`):
        Whether to use double z channels.
    latent_channels (`int`, *optional*, defaults to 256):
        Number of channels for the latent space.
    num_patches (`int`, *optional*, defaults to 32):
        Num of patches the input images can be divided into.
    in_channels (`int`, *optional*, defaults to 3):
        Number of input channels.
    out_channels (`int`, *optional*, defaults to 3):
        Number of out channels.
    base_channels (`int`, *optional*, defaults to 128):
        Base channel count.
    channel_multiplier (`List[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
        Channel multipliers for each resolution.
    num_res_blocks (`int`, *optional*, defaults to 2):
        Number of residual blocks.
    dropout (`float`, *optional*, defaults to 0.0):
        Dropout rate.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    projection_dim (`int`, *optional*, defaults to 2048):
        Dimensionality of the MLP projection head.
    num_hidden_layers (`int`, *optional*, defaults to 2):
        Number of hidden layers in VAVAE MLP Connecter module.
    hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    image_token_embed_dim (`int`, *optional*, defaults to 2048):
        Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
   i @  F       r      )   rx   r*   r*      r*   rb   rf   re   rd   	embed_dimnum_embeddingsdouble_latentlatent_channelsnum_patchesin_channelsout_channelsbase_channelschannel_multipliernum_res_blocksdropoutc                    > [         TU ]  " SUUUUUUU	U
UUS.
UD6  XPl        Xpl        Xl        Xl        Xl        UU l        U ?U ?	U ?
g )N)
rz   r{   r|   r}   r   r   r   r   r   rW   rM   )rN   rO   r~   r   rT   rE   rL   image_token_embed_dim
resolutionattn_resolutions	attn_type)rZ   rz   r{   r|   r}   r~   r   r   r   r   r   r   rW   rT   rE   rL   r   r[   r\   s                     r]   rO   JanusVQVAEConfig.__init__   sv    ( 	 	
)'+#'1)/	
 	
 '(,!2$%:"O!Nr_   )rL   r   rE   r~   r   rT   )rg   rh   ri   rj   rk   intboolr   floatrO   rn   ro   rp   s   @r]   rr   rr      s    *\ ##" (7"#** * 	*
 * * * * * !I* * * *r_   rr   c                   H   ^  \ rS rSrSrSr\\\S.r	    SU 4S jjr
SrU =r$ )JanusConfigi  a  
This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.

e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
[deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
        The config object or dictionary of the text backbone.
    vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
        The config object or dictionary of the vision backbone.
    vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
        The config object or dictionary of the VQVAE backbone.
    image_token_id (`int`, *optional*, defaults to 100581):
        Token index of a placeholder image token.

Example:

```python
>>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

>>> # Initializing a Janus vision config
>>> vision_config = JanusVisionConfig()

>>> # Initializing a Llama config
>>> text_config = LlamaConfig()

>>> # Initializing a VQ config
>>> vq_config = JanusVQVAEConfig()

>>> # Initializing a Janus Pro 1B style configuration
>>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

>>> # Initializing a model from the Janus Pro 1B style configuration
>>> model = JanusForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```janus)text_configrB   	vq_configc                   > [        U[        5      (       a-  UR                  SS5      US'   [        US      " S	0 UD6U l        O_Uc)  [
        R                  S5        [        S   " 5       U l        O3[        U[        5      (       a  Xl        O[        S[        U5       35      eUc%  [
        R                  S5        [        5       U l        OY[        U[        5      (       a  [        S	0 UD6U l        O3[        U[        5      (       a  X l        O[        S[        U5       35      eUc%  [
        R                  S5        [        5       U l        OY[        U[        5      (       a  [        S	0 UD6U l        O3[        U[        5      (       a  X0l        O[        S[        U5       35      eU R                  R                  U R                  R                  -  U R                  l        X@l        [$        TU ]L  " S	0 UD6  g )
Nrl   llamaz7`text_config` is None. Initializing with default valueszTInvalid type for `text_config`. Must be either `dict` or `LlamaConfig`. Type found: zK`vision_config` is None. Initializing with default JanusVisionConfig valuesz\Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`. Type found: zF`vq_config` is None. Initializing with default JanusVQVAEConfig valueszWInvalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`. Type found: rM   )
isinstancedictgetr<   r   loggerinfor;   
ValueErrortyper?   rB   rr   r   rI   rH   r~   image_token_idrN   rO   )rZ   r   rB   r   r   r[   r\   s         r]   rO   JanusConfig.__init__A  s    k4(((3g(NK%-k,.GHW;WD KKQR-g68D%566*  $[ 124 
  KKef!2!4Dt,,!2!C]!CD'899!.  $] 346 
 KK`a-/DN	4((-:	:DN	#344&N  $Y02  &*%7%7%B%BdFXFXFcFc%c","6"r_   )r   r   rB   r   )NNNi )rg   rh   ri   rj   rk   rl   r=   r?   rr   sub_configsrO   rn   ro   rp   s   @r]   r   r     s8    +Z J!*%K 5# 5#r_   r   c                   L    \ rS rSr\rSrSrS/rSS/r	Sr
SrSrSrSrSrS rS	rg
)JanusPreTrainedModeliy  modelTLlamaDecoderLayerpast_key_valuescausal_maskFc                    [        U R                  S5      (       a   U R                  R                  R                  OU R                  R                  n[	        U[
        R                  [
        R                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [	        U[
        R                  [
        R                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g [	        U[
        R                   5      (       ad  UR                  R                  R                  SUS9  UR"                  b2  UR                  R                  UR"                     R                  5         g g g )NrB   rb   )meanstd      ?)hasattrconfigrB   rW   r   r   LinearConv2dweightdatanormal_biaszero_	GroupNorm	LayerNormfill_	Embeddingpadding_idx)rZ   moduler   s      r]   _init_weights"JanusPreTrainedModel._init_weights  sG    t{{O44 KK%%77.. 	
 fryy"))455MM&&CS&9{{&  &&( 'r|| <==KK""$MM$$S)--MM&&CS&9!!-""6#5#56<<> . .r_   rM   N)rg   rh   ri   rj   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_cache_class_supports_static_cache!_supports_param_buffer_assignmentr   rn   rM   r_   r]   r   r   y  sO    L&*#,-#4m"D!N $ !(-%?r_   r   c                   d    \ rS rSr% SrSr\\R                     \	S'   Sr
\R                  \	S'   Srg)JanusVQVAEOutputi  a1  
Base class for Janus VQ-VAE mode model outputs.
Args:
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
Ndecoded_pixel_valuesembedding_lossrM   )rg   rh   ri   rj   rk   r   r   torchFloatTensor__annotations__r   rn   rM   r_   r]   r   r     s/     9=(5#4#45<(,NE%%,r_   r   c                       \ rS rSrSrg)JanusBaseModelOutputWithPasti  rM   Nrg   rh   ri   rj   rn   rM   r_   r]   r   r         r_   r   c                       \ rS rSrSrg)JanusCausalLMOutputWithPasti  rM   Nr   rM   r_   r]   r   r     r   r_   r   c                   V    \ rS rSrSS\R
                  S\S\R
                  4S jjrSrg)	JanusVisionEmbeddingsi  pixel_valuesinterpolate_pos_encodingreturnc                 V   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  U R                  XU5      n	OU R                  U R                  5      n	X-   nU$ )Ndtyper*   rx   )
shapepatch_embeddingr   r   toflatten	transposer   position_embeddingposition_ids)
rZ   r   r   _heightwidthtarget_dtypepatch_embeds
embeddings
pos_embedss
             r]   forwardJanusVisionEmbeddings.forward  s    *001e++2288++LOO,O,OP!))!,66q!<
#66z5QJ001B1BCJ,
r_   rM   N)F)	rg   rh   ri   rj   r   Tensorr   r   rn   rM   r_   r]   r   r     s,    ELL D ]b]i]i  r_   r   c            
          ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                     S\	\R                     S\
\   4S	 jjrS
rU =r$ )JanusVisionAttentioni  z(Attention Class for Janus Vision Encoderr   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        SU l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  5      U l        US:  a  [        R,                  " U5      O[        R.                  " 5       U l        U(       a   [        R0                  " U R                  5      O[        R.                  " 5       U l        U(       a&  [        R0                  " U R                  5      U l        g [        R.                  " 5       U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frx   r   r   )rN   rO   r   rD   rz   rF   	num_headshead_dimr   scalerJ   rU   rV   	is_causalnum_key_value_groupsr   r   rR   q_projk_projv_projprojection_layerDropoutIdentityr   q_normk_norm)rZ   r   proj_dropoutqk_normr\   s       r]   rO   JanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=r_   hidden_statesattention_maskoutput_attentionsr[   c                    UR                  5       u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  SU R
                  U R                  5      nU R                  U5      nU	R	                  SU R
                  U R                  5      n	U R                  U	5      n	UR	                  XVU R
                  U R                  5      R                  SS5      nU	R	                  XVU R
                  U R                  5      R                  SS5      n	U
R                  XVU R
                  U R                  5      R                  SS5      n
[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS5      (       a  [        R!                  S5        O["        U R                  R                     nU" U UU	U
U4U R$                  (       d  S	OU R&                  U R(                  U R*                  S
.UD6u  pUR	                  XVU R,                  5      nU R/                  U5      nU R1                  U5      nU(       a  X4nU$ US 4nU$ )Nrx   r*   eagersdpar  Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rb   )r   scalingr   )sizer   r   r   reshaper   r   r   r   r   viewr6   r   _attn_implementationr   r   warning_oncer"   trainingrJ   r   r   rz   r   rU   )rZ   r   r   r  r[   
batch_sizeseq_lenr   query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightsoutputoutputss                   r]   r   JanusVisionAttention.forward  s%    "/!3!3!5
Q{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HJJnn
%
 
%
! "))*t~~N&&{3((0,=6( EKD>r_   )rJ   r   rz   r   r   r   r   r   r   rU   r   r   r   r   r   )NN)rg   rh   ri   rj   rk   r?   rO   r   r   r   r$   r    r   rn   ro   rp   s   @r]   r   r     sj    2Q0 Q@ 2648	2||2 !.2 $ELL1	2
 -.2 2r_   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )JanusVisionMLPi  r   c                    > [         TU ]  5         Xl        [        UR                  UR
                  -  5      U l        [        UR                     U l	        [        R                  " UR                  U R                  5      U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g N)rN   rO   r   r   rD   rQ   rP   r   rL   activation_fnr   r   fc1fc2r   rS   dropout1dropout2rZ   r   r\   s     r]   rO   JanusVisionMLP.__init__  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>r_   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r  )r  r  r  r  r   rZ   r   s     r]   r   JanusVisionMLP.forward$  sP    /**=9m4/m4r_   )r  r   r  r   r  r  rP   )rg   rh   ri   rj   r?   rO   r   r   r   rn   ro   rp   s   @r]   r  r    s0    ?0 ?U\\ ell  r_   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )JanusVisionEncoderLayeri-  r   c                 H  > [         TU ]  5         Xl        UR                  U l        [        U5      U l        [        R                  " U R                  UR                  S9U l
        [        R                  " U R                  UR                  S9U l        [        U5      U l        g )N)eps)rN   rO   r   rD   rz   r   	self_attnr   r   rK   layer_norm1layer_norm2r  mlpr!  s     r]   rO    JanusVisionEncoderLayer.__init__.  sr    ++-f5<<F<Q<QR<<F<Q<QR!&)r_   )r   rz   r+  r,  r-  r*  rg   rh   ri   rj   r?   rO   rn   ro   rp   s   @r]   r'  r'  -  s    *0 * *r_   r'  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )JanusVisionEncoderi8  r   c                    > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r  )rN   rO   r   
ModuleListrangerE   r'  layersrZ   r   r   r\   s      r]   rO   JanusVisionEncoder.__init__9  sF     mmeTZTlTlNm$nNm%<V%DNm$no$ns   A)r5  r/  rp   s   @r]   r1  r1  8  s    p0 p pr_   r1  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )JanusVisionModeli>  r   c                 D   > [         TU ]  U5        [        U5      U l        g r  )rN   rO   r1  encoderr!  s     r]   rO   JanusVisionModel.__init__?  s     )&1r_   )r;  r/  rp   s   @r]   r9  r9  >  s    20 2 2r_   r9  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVisionAlignerMLPiD  r   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf Nrx   )rN   rO   r   r   rD   rT   r  r3  r4  rX   hidden_layersr   rL   r  r6  s      r]   rO   JanusVisionAlignerMLP.__init__E  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   (5Cc                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r  r  rA  r  rZ   r   layers      r]   r   JanusVisionAlignerMLP.forwardN  B    /''E ..}=M!-0M ( r_   r  r  rA  )	rg   rh   ri   rj   r?   rO   r   rn   ro   rp   s   @r]   r>  r>  D  s    70 7 r_   r>  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )JanusVQVAEVectorQuantizeriV  r   c                 N   > [         TU ]  U5        UR                  /S-  U l        g )Nr*   )rN   rO   r~   quant_state_dimsr!  s     r]   rO   "JanusVQVAEVectorQuantizer.__init__W  s&     !'!3!3 4q 8r_   image_tokensr   c                 >   UR                   S   nU R                  R                  R                   S   nU R                  U5      n[        R                  " USSS9nUR                  U/U R                  QUP75      nUR                  SSSS5      R                  5       nU$ )Nr   r  r*   )pdimr   rx   )	r   	embeddingr   F	normalizer	  rN  permute
contiguous)rZ   rP  r  emb_dimhidden_state_quants        r]   get_codebook_entry,JanusVQVAEVectorQuantizer.get_codebook_entry[  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!r_   )rN  )rg   rh   ri   rj   rr   rO   r   
LongTensorr   r[  rn   ro   rp   s   @r]   rL  rL  V  s4    9/ 9"u/?/? "EDUDU " "r_   rL  c                       \ rS rSrSrg)JanusVQVAEResnetBlockik  rM   Nr   rM   r_   r]   r_  r_  k      r_   r_  c                       \ rS rSrSrg)JanusVQVAEAttnBlockio  rM   Nr   rM   r_   r]   rb  rb  o  r`  r_   rb  c                       \ rS rSrSrg)JanusVQVAEConvDownsampleis  rM   Nr   rM   r_   r]   rd  rd  s  r`  r_   rd  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvUpsampleiw  c                 l   > [         TU ]  5         [        R                  R	                  XSSSS9U l        g )Nr   rx   kernel_sizestridepadding)rN   rO   r   r   r   conv)rZ   r   r\   s     r]   rO   JanusVQVAEConvUpsample.__init__x  s,    HHOOK!TU_`Oa	r_   c                 T    [         R                  " USSS9nU R                  U5      nU$ )Ng       @nearest)scale_factormode)rU  interpolaterl  r$  s     r]   r   JanusVQVAEConvUpsample.forward|  s(    m#IV		-0r_   )rl  )rg   rh   ri   rj   rO   r   rn   ro   rp   s   @r]   rf  rf  w  s    b r_   rf  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	JanusVQVAEMidBlocki  r   channelsc                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        [        UUUS9U l        g )Nr   r   r   )rN   rO   r_  block_1rb  attn_1block_2)rZ   r   rv  r\   s      r]   rO   JanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
r_   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )ry  rz  r{  r$  s     r]   r   JanusVQVAEMidBlock.forward  s2    ]3M2]3r_   )rz  ry  r{  )rg   rh   ri   rj   rr   r   rO   r   r   r   rn   ro   rp   s   @r]   ru  ru    s7    
/ 
3 
U\\ ell  r_   ru  c                   <    \ rS rSrS rS\R                  4S jrSrg)JanusVQVAEEncoderi  c           
         [         R                  R                  5         [        UR                  5      U l        UR                  U l        UR                  nUR                  nUR                  nUR                  nUR                  n[        R                   R                  X2SSSS9U l        S[        U5      -   nXpl        [         R                   " 5       U l        [%        U R
                  5       GH   n[         R                   " 5       n	[         R                   " 5       n
X'U   -  nX&U   -  n[%        U R                  5       HM  nU	R'                  [)        UUUS95        UnXR
                  S-
  :X  d  M3  U
R'                  [+        U5      5        MO     [         R                  " 5       nXl        Xl        XR
                  S-
  :w  a  [1        U5      Ul        U R"                  R'                  U5        GM     [5        UW5      U l        [        R                   R9                  SUSSS	9U l        [        R                   R                  UU(       a  S
U-  OUSSSS9U l        g )Nr   rx   rh  )rx   rx  rv   rc   T
num_groupsrG   r)  affiner*   )r   ModulerO   lenr   num_resolutionsr   r   r   r|   r}   r   r   conv_intuplein_channel_multiplierr3  downr4  appendr_  rb  blockattnrd  
downsampleru  midr   norm_outconv_out)rZ   r   r   r   r|   r}   r   r  i_levelr  r  block_in	block_outi_blockr  s                  r]   rO   JanusVQVAEEncoder.__init__  s   
		"6#<#<=$33,,((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!45)%$,%. %22Q66KK 3H => 6 99;DJI..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
r_   r   c                    U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr  r   rx   )r  r4  r  r   r  r  r  r  r  r  r  r  r   sigmoidr  )rZ   r   r   r  r  hidden_statelast_hidden_states          r]   r   JanusVQVAEEncoder.forward  sB   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  r_   )r  r  r  r  r  r  r   r  N)	rg   rh   ri   rj   rO   r   r]  r   rn   rM   r_   r]   r  r    s    1
f!E$4$4 !r_   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusVQVAEDecoderi  c           
      d  > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nX!R                  U R                  S-
     -  n[        R                  R                  X5SSSS9U l        [        X5      U l        [        R                  " 5       U l        [#        [%        U R                  5      5       H  n[        R                  " 5       n[        R                  " 5       nX!R                  U   -  n	[%        U R
                  S-   5       HM  n
UR'                  [)        UUU	S95        U	nX`R                  S-
  :X  d  M3  UR'                  [+        U5      5        MO     [        R,                  " 5       nX{l        Xl        US:w  a  [3        U5      Ul        U R                   R'                  U5        M     [        R                  R7                  SUSSS	9U l        [        R                  R                  XTSSSS9U l        g )
Nrx   r   rh  rx  r   rv   rc   Tr  )rN   rO   r  r   r  r   r   r}   r   r   r   r   r  ru  r  r3  upreversedr4  r  r_  rb  r  r  r  rf  upsampler   r  r  )rZ   r   r   r}   r   r  r  r  r  r  r  r  r\   s               r]   rO   JanusVQVAEDecoder.__init__  s   "6#<#<=$33,, 00** !#<#<T=Q=QTU=U#VV xxaXYcde &f7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %22Q66KK 3H => : BHG!|4X>GGNN2) =. **bxUYbf*gAVWabcr_   r  r   c                 r   U R                  U5      nU R                  U5      n[        U R                  5       H  n[        U R                  S-   5       Ho  nU R
                  U   R                  U   " U5      n[        U R
                  U   R                  5      S:  d  MM  U R
                  U   R                  U   " U5      nMq     X R                  S-
  :w  d  M  U R
                  U   R                  U5      nM     U R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nrx   r   )r  r  r4  r  r   r  r  r  r  r  r  r   r  r  )rZ   r  r  r  s       r]   r   JanusVQVAEDecoder.forward  s   ||L1 xx- T112G !4!4q!89#www/55g>|Ltwww',,-1#'777#3#8#8#A,#OL : ..22#www/88F 3 }}\2l33}}\2r_   )r  r  r  r  r   r  r  )
rg   rh   ri   rj   rO   r   r   r   rn   ro   rp   s   @r]   r  r    s.    ,d\E$5$5 %:K:K  r_   r  c                      ^  \ rS rSr/ SQrSrS\4U 4S jjrS\R                  S\R                  4S jr\\S\R                  S\\R                  \R                  4   4S	 j5       5       rS
rU =r$ )
JanusVQVAEi)  )rb  r_  rL  r   r   c                 r   > [         TU ]  U5        [        U5      U l        SU l        U R                  5         g )NF)rN   rO   r  decodergradient_checkpointing	post_initr!  s     r]   rO   JanusVQVAE.__init__1  s0     (0&+# 	r_   rP  r   c                    UR                   S   U R                  R                  S   U R                  R                  S   -  :w  aM  [        SU R                  R                  S   U R                  R                  S   -   SUR                    S35      eU R                  R	                  U5      nU R                  U5      nU R                  U5      nU$ )a  
Decodes quantized token IDs into pixel values.
Args:
    image_tokens (torch.LongTensor): Batch of token IDs.
Returns:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Pixel values decoded from the token IDs.
rx   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizerN  r   r[  post_quant_convr  )rZ   rP  codebook_entryr   r   s        r]   decodeJanusVQVAE.decode9  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2r_   c                     UR                   S   nU R                  U5      u  p4nU R                  UR                  US5      5      n[	        Xd5      nU$ )Nr   r  )r   encoder  r	  r   )rZ   r   r  quantr   indicesr   r  s           r]   r   JanusVQVAE.forwardL  sQ     "''*
)-\)B&w#{{7<<
B+GH!"6Gr_   )r  r  )rg   rh   ri   rj   r   main_input_namerr   rO   r   r]  r   r  r&   r%   r	   r   rn   ro   rp   s   @r]   r  r  )  s    
 %O/ 5#3#3 8I8I & 	''	 
u  %"3"33	4	  	r_   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVQVAEAlignerMLPiZ  r   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf r@  )rN   rO   r   r   rz   rT   r  r3  r4  rE   rA  r   rL   r  r6  s      r]   rO   JanusVQVAEAlignerMLP.__init__[  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rrC  c                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r  rE  rF  s      r]   r   JanusVQVAEAlignerMLP.forwardd  rI  r_   rJ  )	rg   rh   ri   rj   rr   rO   r   rn   ro   rp   s   @r]   r  r  Z  s    7/ 7 r_   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	JanusVQVAEHeadil  zOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  UR                  5      U l        g r  )rN   rO   r   r   r   rT   proj_outr   rL   r  r{   vision_headr!  s     r]   rO   JanusVQVAEHead.__init__o  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRr_   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )r  r  r  r$  s     r]   r   JanusVQVAEHead.forwardu  s6    m4**=9((7r_   )r  r  r  )rg   rh   ri   rj   rk   rr   rO   r   r   tensorr   rn   ro   rp   s   @r]   r  r  l  s5    YS/ SU\\ ell  r_   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    )custom_introc                   l  ^  \ rS rSrS\4U 4S jjrS rS rS r\	\
           SS\R                  S\R                  S	\\R                     S
\\R                     S\\   S\\R                     S\\R                     S\\   S\\   S\\   S\\\R                  4   4S jj5       5       rSrU =r$ )
JanusModeli|  r   c                   > [         TU ]  U5        Xl        [        R	                  UR
                  5      U l        [        U R                  R                  5      U l        [        R	                  UR                  5      U l        [        R                  " U R                  R                  R                  U R                  R                  R                  5      U l        [#        U R                  R                  5      U l        ['        U R                  R                  5      U l        [*        R,                  " UR.                  S9U l        SU l        U R5                  5         g )N)r   F)rN   rO   r   r9  _from_configrB   vision_modelr>  alignerr  r   vqmodelr   r   r{   rz   generation_embeddingsr  generation_alignerr  generation_headr+   from_configr   language_modelr  r  r!  s     r]   rO   JanusModel.__init__  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#r_   c                 6    U R                   R                  5       $ r  )r  get_input_embeddingsrZ   s    r]   r  JanusModel.get_input_embeddings  s    ""7799r_   c                 :    U R                   R                  U5        g r  )r  set_input_embeddingsrZ   values     r]   r  JanusModel.set_input_embeddings  s    007r_   c                 ^    U R                  U5      nU R                  UR                  5      nU$ r  )r  r  r  )rZ   r   image_embedss      r]   get_image_featuresJanusModel.get_image_features  s,    ((6||L$B$BCr_   	input_idsr   r   r   r   cache_positioninputs_embeds	use_cacher  output_hidden_stateslogits_to_keepc                 r   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eU R                  (       a/  U R
                  (       a  U(       a  [        R                  S5        SnUb  Ub  [        S5      eUc  U R                  5       " U5      nUb  U R                  U5      nXR                   R                  :H  nUR                  S   nUR                  SU5      nUR                  S5      R                  SSU5      nUR                  UR                   UR"                  5      nUR%                  UU5      nU R&                  " SUUUUUU	U
UUS.	UD6n[)        UR*                  UR,                  UR.                  UR0                  Ub  WOS S9nU$ )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oner  )	r  r   r   r   r  r  r  r  r  )r  r   r   
attentionsimage_hidden_statesrM   )r   r  r  r   r  r  r   r  r  r  r   r   r  	unsqueezeexpandr   devicer   masked_scatterr  r   r  r   r   r  )rZ   r  r   r   r   r   r  r  r  r  r  r  r[   r  image_attention_maskrz   image_features	lm_outputr  s                      r]   r   JanusModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<s  &&4==##p "	#(Av    557	BM#22<@L#,0J0J#J %++B/I)11"i@N#7#A#A"#E#L#LRQSU^#_ +..}/C/C]EXEXYN)889M~^M'' 
')%+/!5))
 
	 .'99%55#11 ++0<0Hd
 r_   )	r  r   r  r  r  r  r  r  r  )NNNNNNNNNNr   )rg   rh   ri   rj   r   rO   r  r  r  r&   r%   r   r]  r   r   r   r   r   r
   r   r   rn   ro   rp   s   @r]   r  r  |  s*   { *:8
  '+*.1537+/5959$(,0/334H##H ''H !.	H
 u//0H "%H !!1!12H   1 12H D>H $D>H 'tnH c5<</0H  Hr_   r  c                     ^  \ rS rSrSS/rSrS\4U 4S jjrS rS r	S	\
R                  S
\
R                  4S jrS rS rS rS r\\            S#S\
R&                  S\
R(                  S\\
R                     S\\
R&                     S\\   S\\
R&                     S\\
R(                     S\\
R&                     S\\   S\\   S\\   S\\\
R                  4   4S jj5       5       r      S$U 4S jjrS\
R                  4S jr\
R:                     S%S	\
R                  S\\
R&                     S \\   4U 4S! jjj5       rS"r U =r!$ )&JanusForConditionalGenerationi  z(model.language_model.embed_tokens.weightzlm_head.weightTr   c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  R                  UR                  R                  SS9U l
        U R                  5         g )NFr   )rN   rO   r   r  r   r   r   r   rD   
vocab_sizelm_headr  r!  s     r]   rO   &JanusForConditionalGeneration.__init__  sZ     '
yy!3!3!?!?ASASA^A^ejk 	r_   c                 J    U R                   R                  R                  5       $ r  )r   r  r  r  s    r]   r  2JanusForConditionalGeneration.get_input_embeddings  s    zz((==??r_   c                 N    U R                   R                  R                  U5        g r  )r   r  r  r  s     r]   r  2JanusForConditionalGeneration.set_input_embeddings  s    

!!66u=r_   inputsr   c                 r    U R                   R                  U5      nU R                   R                  U5      nU$ r  )r   r  r  )rZ   r  r  s      r]   'prepare_embeddings_for_image_generationEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s0    zz77?zz44\Br_   c                     U R                   $ r  r  r  s    r]   get_output_embeddings3JanusForConditionalGeneration.get_output_embeddings  s    ||r_   c                     Xl         g r  r  )rZ   new_embeddingss     r]   set_output_embeddings3JanusForConditionalGeneration.set_output_embeddings
  s    %r_   c                     Xl         g r  r   )rZ   r  s     r]   set_decoder)JanusForConditionalGeneration.set_decoder  s    
r_   c                     U R                   $ r  r  r  s    r]   get_decoder)JanusForConditionalGeneration.get_decoder  s    zzr_   r  r   r   r   r   r  r  labelsr  r  r  r  c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  " SUUUUUUU	U
UUS.
UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb.  U R                  UXR                   R                  R                  S9n[        UUUR                  UR                  UR                  UR                   S9nU$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)
r  r   r   r   r   r  r  r  r  r  )logitsr  r  )lossr  r   r   r  r  rM   )r   r  r  r   r  r   r   slicer  loss_functionr   r  r   r   r   r  r  )rZ   r  r   r   r   r   r  r  r  r  r  r  r  r[   r  r   slice_indicesr  r  r  s                       r]   r   %JanusForConditionalGeneration.forward  s'   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ** 
%)%+'/!5)
 
  118B>SV8W8W~ot4]kmA}a,?@A%%VF{{OfOfOqOq%rD,#33!//)) ' ; ;
 r_   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  X)S'   U	$ )N)r   r  r   r  r  r   r   )rN   prepare_inputs_for_generation)rZ   r  r   r   r   r  r  r  r[   model_inputsr\   s             r]   r  ;JanusForConditionalGeneration.prepare_inputs_for_generationP  sR     w<
+')))
 
 !!+7(r_   rP  c                 x    U R                   R                  R                  U5      nUR                  SSSS5      nU$ )z
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.
Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
r   r*   r   rx   )r   r  r  rW  )rZ   rP  decoded_images      r]   decode_image_tokens1JanusForConditionalGeneration.decode_image_tokensn  s:     

**11,?%--aAq9r_   logits_processorc           	      N  > UR                  SU R                  5      n[        R                  " U5      nUR                  SS5      nUS:X  a  [        T%U ]  " SUUUS S.UD6$ UR                  " S0 UD6nUR                  5       [        R                  [        R                  4;  a  [        S5      eUR                  5         U R                  UR                  5       5        Ub  UO	[        5       nSUS'   UR                  c  [         R#                  S5        S	Ul        UR                  US
'   U R%                  XR&                  U5      u  pnUR(                  UR*                  p[-        UR.                  5      S:w  a  [        SUR.                   S35      eUS LnU R1                  X\UR*                  S9  UR                  (       a;  UR                  S:  a+  UR3                  [5        UR                  5      5        S Ul        U R7                  UUR.                  S   US UUS9nU R8                  " SUUUR:                  S.UD6u  pU R<                  R>                  R@                  RB                  nUR.                  u  pURE                  SS5      nUR                  SS 5      nURE                  SS5      nX'S'   UUS 2S S 24   UR&                  :g  UUS 2S S 24   URF                  S   :g  -  nUUS 2S S 24   RI                  UURJ                  5        U RM                  5       " U5      nU RO                  XU5      nURQ                  SS 5      cB  U RS                  URT                  =(       d    SUS-  [W        URX                  X-   5      UUS9US'   [Z        R\                  " X4XS9nUR^                  nUR`                  nURb                  nURd                  nURf                  nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS n[i        U5       GHy  nU Rj                  " SUUS.UD6nUS   Rm                  UR*                  5      US'   US   Rm                  UR*                  5      US'   U R<                  Rn                  " S0 UDUUS.D6nU Rq                  UU5      nURr                  S S 2SS S 24   Ru                  5       n U R<                  Rw                  U 5      n!U" UU!5      n"URx                  (       a:  [Z        Rz                  " U"SS9n#[Z        R|                  " U#SS9R                  S5      n$O[Z        R                  " U"SS9n$U$US S 2U4'   [Z        R                  " U$U$/5      n$U$R                  S5      n$U R                  U$5      nGM|     U(       aT  U(       a  UW!4-  nU(       a  UW R                  5       4-  nU(       a  UWR                  -  nU(       a  UWR                  -  nU(       a  [        UW!UUUWR                  S9$ U$ ) Ngeneration_configgeneration_modetext)r  r   r#  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r&  r*   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  rx   )r#  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr!  r  )r  r   expand_sizer   boi_token_idr   static)cache_implementationr  max_cache_lenr  model_kwargs)r   r  rM   )r  r  r  )r  r  r  )rS  )num_samples)	sequencesscoresr  r  r   r   )Ipopr#  copydeepcopyrN   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r&  r   warning_prepare_model_inputsbos_token_idr   r  r  r   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr   r  r   rY   repeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionr   
_get_cacher.  max
max_lengthr   zerosr  r  output_scoresoutput_logitsreturn_dict_in_generater4  r  r   r  #_update_model_kwargs_for_generationr  cloner  	do_samplesoftmaxmultinomialsqueezeargmaxcatr  r   r   r  r   r   r   )&rZ   r  r   r!  r[   r#  r$  r0  r  model_input_namer   r  kwargs_has_attention_maskrY   r  r  input_tokensmaskr  generated_tokensr  r  rN  rO  rP  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  r  r3  next_token_scoresprobs
next_tokenr\   s&                                        r]   r7  &JanusForConditionalGeneration.generatez  sP    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   )//9&9 002>;P;PR`RnRn:ool  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N22L5
1	\ ")9)9vy1$MiooM^EF  %3$$>!$$%6ZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #'"D"D #
))>>#
 	#
	  ::2299JJ'oo
 ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@P@Z[) /> /L*+ !;;
'EUb .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*jj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG )J #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#r_   )r   r  r   )NNNNNNNNNNNr   )NNNNNN)NNN)"rg   rh   ri   rj   _tied_weights_keysr   r   rO   r  r  r   r   r   r  r  r  r  r&   r%   r]  r   r   r   r   r
   r   r   r  r  no_gradr   r7  rn   ro   rp   s   @r]   r  r    s   DFVW!{ @>ell u|| 
&  '+*.1537+/5959-1$(,0/3349##9 ''9 !.	9
 u//09 "%9 !!1!129   1 129 ))*9 D>9 $D>9 'tn9 c5<</09  9| <
 
 ]]  $59:>	}$}$ !!1!12}$ ##67	}$ }$r_   r  c                     ^  \ rS rSrSrSSS\R                  SSSSSS4
S\S\\	\
\4      S	\S
\S\S\\\4   S\S\\\\\   4      S\\\\\   4      S\\   4U 4S jjjr   SS\R"                  S\\\\\\4   4   S\\\
\4      S\\\
\4      S\R(                  4
S jjr\R                  SS4S\R"                  S\\	\
\4   \4   S
\S\\\
\4      S\\\
\4      S\R"                  4S jjr       SS\S\\   S\\   S\\   S\\\      S\\\      S\\
   S\\
   4S jjr S S\R(                  S\\\\   4   S\\\\   4   S\\\
\4      S\R(                  4
S jjrSrU =r$ )!JanusImageProcessori;  a
  
Constructs a JANUS image processor.

Args:
    do_resize (`bool`, *optional*, defaults to `True`):
        Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
        `do_resize` parameter in the `preprocess` method.
    size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
        Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
        method.
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
        Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
        overridden by the `resample` parameter in the `preprocess` method.
    do_rescale (`bool`, *optional*, defaults to `True`):
        Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
        `do_rescale` parameter in the `preprocess` method.
    rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
        Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
        overridden by the `rescale_factor` parameter in the `preprocess` method.
    do_normalize (`bool`, *optional*, defaults to `True`):
        Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
        method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
    image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
        Mean to use if normalizing the image. This is a float or list of floats the length of the number of
        channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
        overridden by the `image_mean` parameter in the `preprocess` method.
    image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
        Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
        number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        Can be overridden by the `image_std` parameter in the `preprocess` method.
    do_convert_rgb (`bool`, *optional*, defaults to `True`):
        Whether to convert the image to RGB.
TN   gp?	do_resizer  min_sizeresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbc           	         > [         TU ]  " S0 UD6  X0l        Uc  SU l        g [	        U Vs/ s H  n[        US-  5      PM     sn5      U l        g s  snf )N)   rv  rv     rM   )rN   rO   rm  background_colorr  r   )rZ   rl  r  rm  rn  ro  rp  rq  rr  rs  rt  r[   xr\   s                r]   rO   JanusImageProcessor.__init__a  sO     	"6" $3D!$)*LA3q3w<*L$MD!*Ls   Aimagerx  data_formatinput_data_formatr   c                 6   [        X5      u  pVU[        R                  :X  a  UR                  S   OUR                  S   nXV:X  a  Ub  [	        XU5      nU$ UnU$ [        XV5      n[        U[        5      (       a  U/nO[        U5      U:w  a  [        SU S35      eU[        R                  :X  av  [        R                  " XxU4UR                  S9n	[        U5       H  u  pXU
SS2SS24'   M     Xe:  a  X-
  S-  nXSS2XU-   2SS24'   U	$ X-
  S-  nXSS2SS2XU-   24'    U	$ [        R                  " XU4UR                  S9n	[        U5       H  u  pXSS2SS2U
4'   M     Xe:  a  X-
  S-  nXXU-   2SS2SS24'   U	$ X-
  S-  nXSS2XU-   2SS24'   U	$ )a  
Pads an image to a square based on the longest edge.

Args:
    image (`np.ndarray`):
        The image to pad.
    background_color (`int` or `Tuple[int, int, int]`, *optional*, defaults to 0):
        The color to use for the padding. Can be an integer for single channel or a
        tuple of integers representing for multi-channel images. If passed as integer
        in mutli-channel mode, it will default to `0` in subsequent channels.
    data_format (`str` or `ChannelDimension`, *optional*):
        The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        If unset, will use same as the input image.
    input_data_format (`str` or `ChannelDimension`, *optional*):
        The channel dimension format for the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

Returns:
    `np.ndarray`: The padded image.
r   r  Nz(background_color must have no more than z) elements to match the number of channelsr   r*   )r   r   FIRSTr   r   rK  r   r   r  r   nprM  r   	enumerate)rZ   r{  rx  r|  r}  r   r   rG   max_dimresultrb  colorstarts                r]   pad_to_square!JanusImageProcessor.pad_to_squarew  s   < 'u@):>N>T>T)Tu{{1~Z_ZeZefhZi? * ,E@QR 
 L  
 Lf$ &,, 01!"l2:<.Hqr   0 6 66XX|g>ekkRF%&67"'q!Qw 8~ )a/7<q%&.0!34  !Q.6;q!UU]223  XXw>ekkRF%&67"'q!Qw 8~ )a/7<uv~-q!34
  !Q.6;q%%-/23r_   c                    Uc  [        U5      n[        X5      u  px[        Xx5      n	[        USS9nUS   US   :w  a  [	        SUS    SUS    35      eUS   nX)-  n
[        [        Xz-  5      U R                  5      [        [        X-  5      U R                  5      /n[        U4UUUUS.UD6nU R                  UU R                  US9nU$ )	a$  
Resize an image to dynamically calculated size.

Args:
    image (`np.ndarray`):
        Image to resize.
    resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
        `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
    data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the output image. If unset, the channel dimension format of the input
        image is used. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `None`: will be inferred from input
    input_data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the input image. If unset, the channel dimension format is inferred
        from the input image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

Returns:
    `np.ndarray`: The resized image.
T)default_to_squarer   r   z5Output height and width must be the same. Got height=z and width=)r  rn  r|  r}  )r{  rx  r}  )
r   r   rK  r   r   r   rm  r   r  rx  )rZ   r{  r  rn  r|  r}  r[   r   r   max_sizedeltaoutput_size_nonpaddeds               r]   r   JanusImageProcessor.resize  s   B $ >u E&u@v%TT:>T']*GXGWWbcghocpbqr  H~ FN#T]]3EM"DMM2!

 
&#/
 
 ""!22/ # 

 r_   imagesreturn_tensorsc	                 <   Ub  UOU R                   nUc  SU R                  -  OUnUb  UOU R                  nUb  UOU R                  nUb  UOU R                  n[        U5      n[        US   [        R                  R                  5      (       a  [        U5      S:  a  U$ US   $ Uc  [        US   5      n/ n	U H  n
[        U
5      n
U(       a  U R                  XXgS9n
U(       a?  U R                  XUS9n
U
R                  SS5      R                  [         R"                  5      n
U(       aE  U(       a>  US:X  a8  [%        U
[&        R(                  US	9n
[        R                  R+                  U
5      n
U	R-                  U
5        M     S
U	0nUS:w  a  UOSn[/        XS9$ )znApplies post-processing to the decoded image tokens by reversing transformations applied during preprocessing.Nr   r   rx   )r{  rr  rs  r}  )r   r}  rw  zPIL.Image.Image)input_channel_dimr   )r   tensor_type)ro  rp  rq  rr  rs  r   r   PILImager  r   r   unnormalizerescaleclipastyper  uint8r   r   LAST	fromarrayr  r   )rZ   r  ro  rp  rq  rr  rs  r}  r  r   r{  r   s               r]   postprocessJanusImageProcessor.postprocess  s    $.#9Zt
6D6Lt222R`'3'?|TEVEV#-#9Zt
!*!6IDNN	$V,fQi11 [1_6;&);$ >vay IE"5)E(() )  UTef

1c*11"((;
~AR/R3E;K;P;Pduv		++E2&! $ -+9=N+NTXBBr_   c                    Sn[        U[        5      (       a*  [        U5      U:w  a  [        SU S[        U5       35      eOU/U-  n[        U[        5      (       a*  [        U5      U:w  a  [        SU S[        U5       35      eOU/U-  n[	        S [        X#5       5       5      n[	        S U 5       5      nU R                  XXtS9nU$ )a  
Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
image = (image * image_std) + image_mean
Args:
    image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
        Batch of pixel values to postprocess.
    image_mean (`float` or `Iterable[float]`):
        The mean to use for unnormalization.
    image_std (`float` or `Iterable[float]`):
        The standard deviation to use for unnormalization.
    input_data_format (`ChannelDimension` or `str`, *optional*):
        The channel dimension format for the input image. If unset, the channel dimension format is inferred
        from the input image. Can be one of:
        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
r   zmean must have z$ elements if it is an iterable, got zstd must have c              3   2   #    U  H  u  pU* U-  v   M     g 7fr  rM   ).0r   r   s      r]   	<genexpr>2JanusImageProcessor.unnormalize.<locals>.<genexpr>_  s     W<Vytus{<Vs   c              3   ,   #    U  H
  nS U-  v   M     g7f)rx   NrM   )r  r   s     r]   r  r  `  s     ;#a#gs   )r{  r   r   r}  )r   r   r  r   r  ziprV  )rZ   r{  rr  rs  r}  rG   rev_image_meanrev_image_stds           r]   r  JanusImageProcessor.unnormalize9  s    0 j(++:,. ?<.@dehisetdu!vww / %4Ji**9~- >,?cdghqdrcs!tuu . #l2IWC
<VWW;;;-  
 r_   )rx  rm  )r   NN)NNNNNNNr  )rg   rh   ri   rj   rk   r   BICUBICr   r   r   strr   r
   r   r   rO   r  ndarrayr	   r   arrayr  r   r   r  r   r  rn   ro   rp   s   @r]   rj  rj  ;  s   #N )-'9'A'A,3!:>9=)-NN tCH~&N 	N
 %N N c5j)N N U5$u+#567N E%e"456N !N N2 >?>BDHHzzH  U3S=%9 9:H eC)9$9:;	H
 $E#/?*?$@AH 
H\ (:'A'A>BDHCzzC DcNC'(C %	C
 eC)9$9:;C $E#/?*?$@AC 
CP &**.'+,0+/+/(,1C1C TN1C !	1C
 tn1C T%[)1C DK(1C $C=1C !1Cp EI+xx+ %%01+ /0	+
 $E#/?*?$@A+ 
+ +r_   rj  )	rj  r   r  r  r  r9  rr   r?   r   )|r5  dataclassesr   typingr   r   r   r   r   r	   r
   numpyr  r   r   .transformers.models.blip.image_processing_blipr   activationsr   cache_utilsr   
generationr   r   r   r   generation.utilsr   image_processing_utilsr   r   image_transformsr   r   image_utilsr   r   r   r   r   r   r   modeling_flash_attention_utilsr    modeling_outputsr!   modeling_utilsr"   r#   processing_utilsr$   utilsr%   r&   r'   r(   r)   autor+   blip_2.modeling_blip_2r,   !chameleon.configuration_chameleonr-   chameleon.modeling_chameleonr.   r/   r0   r1   r2   r3   idefics.modeling_ideficsr4   r5   llama.modeling_llamar6   siglip.configuration_siglipr7   siglip.modeling_siglipr8   r9   r:   torch.nntorch.nn.functional
functionalrU  torch.utils.checkpointr  configuration_utilsr;   r<   r=   
get_loggerrg   r   r?   rr   r   r   r   r   r   r   r  r   r  r'  r1  r9  r>  rL  r_  rb  rd  rf  ru  r  r  r  r  r  r  r  rj  __all__rM   r_   r]   <module>r     s     ! I I I    M !   u u 9 A C   C + F & g g  5 D  e : < ^ ^ ##! 3 - 
		H	%
^1* ^1BW+ Wtj#" j#Z ?? ? ?@ -{ - - 	#A 	 	 	"? 	 	2 "R299 RjRYY (*0 *p p2' 2BII $" = "*	< 		8 		B 	RYY  ,J!-ryy J!ZA		 AH. .b299 $RYY   
k% k
k\I$$8/ I$X
i, iX	
r_   