
    fTh                        S SK r S SKJr  S SKJrJrJrJrJr  S SK	r	S SK	J
r
  SSKJr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.  \&" 5       (       a	  S SK/J
s  J0r1  \'Rd                  " \35      r4\$ " S S\ 5      5       r5\ " S S\5      5       r6\ " S S\5      5       r7\ " S S\5      5       r8 " S S\
Rr                  5      r:S\	Rv                  S\<S \	Rv                  4S! jr= STS"\
Rr                  S#\	Rv                  S$\	Rv                  S%\	Rv                  S&\\	Rv                     S'\>S(\>4S) jjr? " S* S+\
Rr                  5      r@ " S, S-\
Rr                  5      rA " S. S/\5      rB " S0 S1\
Rr                  5      rC\$ " S2 S3\55      5       rD " S4 S5\
Rr                  5      rE " S6 S7\
Rr                  5      rF " S8 S9\
Rr                  5      rG " S: S;\
Rr                  5      rH " S< S=\
Rr                  5      rI " S> S?\
Rr                  5      rJ " S@ SA\
Rr                  5      rK " SB SC\
Rr                  5      rL " SD SE\
Rr                  5      rM\$" SFSG9 " SH SI\55      5       rN " SJ SK\
Rr                  5      rO " SL SM\
Rr                  5      rP\$" SNSG9 " SO SP\55      5       rQ " SQ SR\5\5      rR/ SSQrSg)U    N)	dataclass)CallableListOptionalTupleUnion)nn   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_availablelogging	torch_int   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   L    \ rS rSr\rSrSrS/rSS/r	Sr
SrSrSrSrSrS rS	rg
)JanusPreTrainedModel8   modelTLlamaDecoderLayerpast_key_valuescausal_maskFc                    [        U R                  S5      (       a   U R                  R                  R                  OU R                  R                  n[	        U[
        R                  [
        R                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [	        U[
        R                  [
        R                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g [	        U[
        R                   5      (       ad  UR                  R                  R                  SUS9  UR"                  b2  UR                  R                  UR"                     R                  5         g g g )Nvision_config        )meanstdg      ?)hasattrconfigr-   initializer_range
isinstancer	   LinearConv2dweightdatanormal_biaszero_	GroupNorm	LayerNormfill_	Embeddingpadding_idx)selfmoduler0   s      `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/janus/modeling_janus.py_init_weights"JanusPreTrainedModel._init_weightsF   sG    t{{O44 KK%%77.. 	
 fryy"))455MM&&CS&9{{&  &&( 'r|| <==KK""$MM$$S)--MM&&CS&9!!-""6#5#56<<> . .     N)__name__
__module____qualname____firstlineno__r"   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_cache_class_supports_static_cache!_supports_param_buffer_assignmentrD   __static_attributes__rG   rF   rC   r&   r&   8   sO    L&*#,-#4m"D!N $ !(-%?rF   r&   c                   d    \ rS rSr% SrSr\\R                     \	S'   Sr
\R                  \	S'   Srg)JanusVQVAEOutputY   a1  
Base class for Janus VQ-VAE mode model outputs.
Args:
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
Ndecoded_pixel_valuesembedding_lossrG   )rH   rI   rJ   rK   __doc__r[   r   torchFloatTensor__annotations__r\   rW   rG   rF   rC   rY   rY   Y   s/     9=(5#4#45<(,NE%%,rF   rY   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\\R                           \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
JanusBaseModelOutputWithPasth   at	  
Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlast_hidden_stater*   hidden_states
attentionsimage_hidden_statesrG   )rH   rI   rJ   rK   r]   rd   r   r^   r_   r`   r*   r   re   rf   rg   rW   rG   rF   rC   rb   rb   h   s    "H 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju00129>B%(9(9":;BrF   rb   c                   "   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   S
rg)JanusCausalLMOutputWithPast   an  
Base class for Janus causal language model (or autoregressive) outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlosslogitsr*   re   rf   rg   rG   )rH   rI   rJ   rK   r]   rk   r   r^   r_   r`   rl   r*   r   re   r   rf   rg   rW   rG   rF   rC   ri   ri      s    @ )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju00129>B%(9(9":;BrF   ri   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\
S\R                  4S jjrSrU =r$ )JanusVisionEmbeddings   r2   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l
        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        U R                  S[         R"                  " U R                  5      R%                  S5      SS9  g )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r!   F)
persistent)super__init__r2   hidden_size	embed_dim
image_size
patch_sizer	   r6   num_channelspatch_embeddingnum_patchesnum_positionsr?   position_embeddingregister_bufferr^   arangeexpandrA   r2   	__class__s     rC   r{   JanusVisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]joprF   
embeddingsheightwidthreturnc                    UR                   S   nU R                  R                  R                   S   n[        R                  R                  5       (       d%  XE:X  a   X#:X  a  U R                  U R                  5      $ U R                  R                  R                  S5      nUR                   S   nX R                  -  nX0R                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and no class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r!   r   rx   g      ?r
   r   bicubicF)sizemodealign_corners)shaper   r7   r^   jit
is_tracingrw   	unsqueezer   r   reshapepermuter	   
functionalinterpolateview)rA   r   r   r   r   r   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              rC   interpolate_pos_encoding.JanusVisionEmbeddings.interpolate_pos_encoding   s:    !&&q)//66<<Q? yy##%%+*F6?**4+<+<==1188BB1Er".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#NrF   pixel_valuesr   c                 V   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  U R                  XU5      n	OU R                  U R                  5      n	X-   nU$ )N)dtyper   r!   )
r   r   r7   r   toflatten	transposer   r   rw   )
rA   r   r   _r   r   target_dtypepatch_embedsr   
pos_embedss
             rC   forwardJanusVisionEmbeddings.forward   s    *001e++2288++LOO,O,OP!))!,66q!<
#66z5QJ001B1BCJ,
rF   )r2   r}   r~   r   r   r   r   r   F)rH   rI   rJ   rK   r#   r{   r^   Tensorintr   boolr   rW   __classcell__r   s   @rC   rn   rn      sj    q0 q($5<< $ $UX $]b]i]i $LELL D ]b]i]i  rF   rn   re   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r!   N)r   r   r   )re   r   batchnum_key_value_headsslenhead_dims         rC   	repeat_kvr   
  s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTrF   rB   querykeyvalueattention_maskscalingdropoutc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr   r
   rx   )r   r   )ptrainingr!   )r   num_key_value_groupsr^   matmulr   r   r	   r   softmaxfloat32r   r   r   r   
contiguous)rB   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsr+   attn_outputs                rC   eager_attention_forwardr     s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$rF   c            
          ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                     S\	\R                     S\
\   4S	 jjrS
rU =r$ )JanusVisionAttentioni0  z(Attention Class for Janus Vision Encoderr2   c                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        SU l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  U R                  -  UR"                  S9U l        [        R                   " U R                  U R                  5      U l        US:  a  [        R,                  " U5      O[        R.                  " 5       U l        U(       a   [        R0                  " U R                  5      O[        R.                  " 5       U l        U(       a&  [        R0                  " U R                  5      U l        g [        R.                  " 5       U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      Fr!   r:   r   )rz   r{   r2   r|   r}   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r	   r5   attention_biasq_projk_projv_projprojection_layerDropoutIdentityr=   q_normk_norm)rA   r2   proj_dropoutqk_normr   s       rC   r{   JanusVisionAttention.__init__3  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=rF   re   r   output_attentionsr   c                    UR                  5       u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  SU R
                  U R                  5      nU R                  U5      nU	R	                  SU R
                  U R                  5      n	U R                  U	5      n	UR	                  XVU R
                  U R                  5      R                  SS5      nU	R	                  XVU R
                  U R                  5      R                  SS5      n	U
R                  XVU R
                  U R                  5      R                  SS5      n
[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS5      (       a  [        R!                  S5        O["        U R                  R                     nU" U UU	U
U4U R$                  (       d  S	OU R&                  U R(                  U R*                  S
.UD6u  pUR	                  XVU R,                  5      nU R/                  U5      nU R1                  U5      nU(       a  X4nU$ US 4nU$ )Nrx   r!   r   eagersdpar   Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r.   )r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r2   _attn_implementationgetloggerwarning_oncer   r   r   r   r   r}   r   r   )rA   re   r   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputoutputss                   rC   r   JanusVisionAttention.forwardP  s%    "/!3!3!5
Q{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HJJnn
%
 
%
! "))*t~~N&&{3((0,=6( EKD>rF   )r   r2   r}   r   r   r   r   r   r   r   r   r   r   r   r   )NN)rH   rI   rJ   rK   r]   r#   r{   r^   r   r   r   r   r   rW   r   r   s   @rC   r   r   0  sj    2Q0 Q@ 2648	2||2 !.2 $ELL1	2
 -.2 2rF   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )JanusVisionMLPi  r2   c                    > [         TU ]  5         Xl        [        UR                  UR
                  -  5      U l        [        UR                     U l	        [        R                  " UR                  U R                  5      U l        [        R                  " U R                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g N)rz   r{   r2   r   r|   	mlp_ratiointermediate_sizer   
hidden_actactivation_fnr	   r5   fc1fc2r   hidden_dropout_ratedropout1dropout2r   s     rC   r{   JanusVisionMLP.__init__  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>rF   re   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r  r  r  r  r  rA   re   s     rC   r   JanusVisionMLP.forward  sP    /**=9m4/m4rF   )r  r2   r  r  r  r  r   )rH   rI   rJ   rK   r#   r{   r^   r   r   rW   r   r   s   @rC   r   r     s0    ?0 ?U\\ ell  rF   r   c            
          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S\\	   S\
\R                     4S jjrS	rU =r$ )JanusVisionEncoderLayeri  r2   c                 H  > [         TU ]  5         UR                  U l        [        R
                  " U R                  UR                  S9U l        [        U5      U l	        [        R
                  " U R                  UR                  S9U l
        [        U5      U l        Xl        g N)eps)rz   r{   r|   r}   r	   r=   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr2   r   s     rC   r{    JanusVisionEncoderLayer.__init__  sr    ++<<F<Q<QR-f5<<F<Q<QR!&)rF   re   r   r   r   c                     UnU R                  U5      nU R                  UUUS9u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU4nU(       a  Xe4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
    attention_mask (`torch.FloatTensor`):
        Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)re   r   r   )r  r  r  r  )rA   re   r   r   residualr   r   s          rC   r   JanusVisionEncoderLayer.forward  s      !((7&*nn')/ '5 '
#
 !0 ((7/ 0 "&GrF   )r2   r}   r  r  r  r  r   )rH   rI   rJ   rK   r#   r{   r^   r   r   r   r   r_   r   rW   r   r   s   @rC   r  r    s\    0  -2	$||$ $ $D>	$
 
u  	!$ $rF   r  c            
          ^  \ rS rSrSrS\4U 4S jjr\   SS\\	R                     S\\   S\\   S\4S	 jj5       rS
rU =r$ )JanusVisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`JanusVisionEncoderLayer`].

Args:
    config: JanusVisionConfig
r2   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
rz   r{   r2   r	   
ModuleListrangenum_hidden_layersr  layersgradient_checkpointingrA   r2   r   r   s      rC   r{   JanusVisionEncoder.__init__  sT    mmeTZTlTlNm$nNm%<V%DNm$no&+# %os   A&r   r   output_hidden_statesr   c                 F   Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUnU R                   H-  nU(       a  XW4-   nU" UUUS9n	U	S   nU(       d  M%  XiS   4-   nM/     U(       a  XW4-   n[	        UUUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrG   )r   r   r!   )rd   re   rf   )r2   r   r$  r   r   )
rA   inputs_embedsr   r   r$  encoder_statesall_attentionsre   encoder_layerlayer_outputss
             rC   r   JanusVisionEncoder.forward  s    < 2C1N-TXT_T_TqTq$8$D $++JjJj 	  40d%![[M#!/2B!B)"3M *!,M  !/3C2E!E )  +.>>N+(%
 	
rF   )r2   r!  r   NNN)rH   rI   rJ   rK   r]   r#   r{   r   r   r^   r   r   r   r   rW   r   r   s   @rC   r  r    sm    ,0 ,  26,0/3<
 !.<
 $D>	<

 'tn<
 
<
 <
rF   r  c                      ^  \ rS rSrSr\rS\4U 4S jjr\     SS\	\
R                     S\	\   S\	\   S\	\   S\S	\\\4   4S
 jj5       rS rSrU =r$ )JanusVisionModeli  r   r2   c                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r  )rz   r{   r2   r|   rn   r   r  encoderr	   r=   r  post_layernorm	post_init)rA   r2   r}   r   s      rC   r{   JanusVisionModel.__init__  sY     &&	/7)&1 ll9:O:OPrF   r   r$  return_dictr   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  UUUUS9nUS   nU R                  U5      nUS S 2SS S 24   n	U R                  U	5      n	U(       d	  X4USS  -   $ [        UU	UR                  UR                  S9$ )Nz You have to specify pixel_values)r   )r&  r   r$  r4  r   r!   )rd   pooler_outputre   rf   )r2   r   r$  use_return_dictr   r   r0  r1  r   re   rf   )
rA   r   r   r$  r4  r   re   encoder_outputsrd   pooled_outputs
             rC   r   JanusVisionModel.forward*  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h,,'/!5#	 ' 
 ,A. //0AB)!Q'2++M:%58KKK)/')77&11	
 	
rF   c                     U R                   $ r   )r   rA   s    rC   get_input_embeddings%JanusVisionModel.get_input_embeddingsU  s    rF   )r2   r   r0  r1  )NNNNF)rH   rI   rJ   rK   main_input_namer#   rL   r{   r   r   r^   r_   r   r   r   r   r   r=  rW   r   r   s   @rC   r.  r.    s    $O$L	0 	  59,0/3&*).(
u001(
 $D>(
 'tn	(

 d^(
 #'(
 
u00	1(
 (
T rF   r.  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVisionAlignerMLPiY  r2   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf Nr!   )rz   r{   r	   r5   r|   projection_dimr  r  r  depthhidden_layersr   r   r  r"  s      rC   r{   JanusVisionAlignerMLP.__init__Z  s    99V//1F1FG]]NSTUW]WcWcNdeNdRYYv,,f.C.CDNde
 $F$5$56 f   (5Cc                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r   r  rF  r  rA   re   layers      rC   r   JanusVisionAlignerMLP.forwardc  B    /''E ..}=M!-0M ( rF   r  r  rF  )	rH   rI   rJ   rK   r#   r{   r   rW   r   r   s   @rC   rA  rA  Y  s    70 7 rF   rA  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	S\R                  S\R                  4S	 jrS
rU =r$ )JanusVQVAEVectorQuantizerik  a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
r2   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        USS5      U l        [        R                  " U R                  U R                  5      U l	        UR                  /S-  U l        g )Nbetag      ?r   )rz   r{   num_embeddingsr}   embedding_dimgetattrrS  r	   r?   	embeddingr   quant_state_dimsr   s     rC   r{   "JanusVQVAEVectorQuantizer.__init__v  sn    $33#--FFD1	d&9&94;M;MN!'!3!3 4q 8rF   hidden_statec           
      >   UR                  SSSS5      R                  5       nUR                  SU R                  5      n[        R
                  " US-  SSS9[        R
                  " U R                  R                  S-  SS9-   S[        R                  " S	X R                  R                  R                  SS5      5      -  -
  n[        R                  " USS9nU R                  U5      R                  UR                  5      n[        R                  " UR                  5       U-
  S-  5      U R                  [        R                  " XQR                  5       -
  S-  5      -  -   nXU-
  R                  5       -   nUR                  SSSS5      R                  5       nXVU4$ )
Nr   r   r
   r!   rx   T)r   keepdimr   z	bd,dn->bn)r   r   r   rU  r^   sumrW  r7   einsumr   argminr   r/   detachrS  )rA   rZ  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrk   s          rC   r   !JanusVQVAEVectorQuantizer.forward  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e"5"5"77A=\
 P
 

 *,-N,V,V,XX 0771aCNNP!)===rF   image_tokensr   c                 >   UR                   S   nU R                  R                  R                   S   nU R                  U5      n[        R                  " USSS9nUR                  U/U R                  QUP75      nUR                  SSSS5      R                  5       nU$ )Nr   rx   r   )r   r   r
   r!   )	r   rW  r7   F	normalizer   rX  r   r   )rA   rg  r   emb_dimre  s        rC   get_codebook_entry,JanusVQVAEVectorQuantizer.get_codebook_entry  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!rF   )rS  rW  rU  rT  rX  )rH   rI   rJ   rK   r]   r$   r{   r^   r   r   
LongTensorr_   rl  rW   r   r   s   @rC   rQ  rQ  k  sI    9/ 9>ELL >6"u/?/? "EDUDU " "rF   rQ  c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ )JanusVQVAEResnetBlocki  c                   > [         TU ]  5         X l        Uc  UOUU l        X@l        [
        R                  R                  SUSSS9U l        [
        R                  R                  X#SSSS9U l
        [
        R                  R                  SUSSS9U l        [
        R                  R                  UR                  5      U l        [
        R                  R                  X3SSSS9U l        U R                  U R                  :w  a]  U R                  (       a&  [
        R                  R                  X#SSSS9U l        g [
        R                  R                  X#SSSS9U l        g g )	N    ư>T
num_groupsr   r  affiner
   r!   rt   ru   rv   r   )rz   r{   rr   rs   use_conv_shortcutr^   r	   r<   norm1r6   conv1norm2r   r   conv2conv_shortcutnin_shortcut)rA   r2   rr   rs   r}  r   s        rC   r{   JanusVQVAEResnetBlock.__init__  s%    	&+7+?K\!.XX''2KUYbf'g
XX__[AVWab_c
XX''2LVZcg'h
xx''7XX__\QWXbc_d
t000%%%*XX__[\]fgqr_%s"$)HHOOK[\efpqO$r!	 1rF   c                    UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U5      nU R                  U R                  :w  a7  U R                  (       a  U R                  U5      nX!-   $ U R                  U5      nX!-   $ r   )ry  r^   sigmoidrz  r{  r   r|  rr   rs   rx  r}  r~  )rA   re   r  s      rC   r   JanusVQVAEResnetBlock.forward  s     

=1}55

=1

=1}55]3

=1t000%%--h7 ''  ,,X6''rF   )
rz  r|  r}  r   rr   r~  ry  r{  rs   rx  r  rH   rI   rJ   rK   r{   r   rW   r   r   s   @rC   rp  rp    s    
 s.( (rF   rp  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEAttnBlocki  c                   > [         TU ]  5         Xl        [        R                  R                  SUSSS9U l        [        R                  R                  XSSSS9U l        [        R                  R                  XSSSS9U l	        [        R                  R                  XSSSS9U l
        [        R                  R                  XSSSS9U l        g )Nrr  rs  Trt  r!   r   rw  )rz   r{   rr   r^   r	   r<   normr6   qkvproj_outrA   rr   r   s     rC   r{   JanusVQVAEAttnBlock.__init__  s    &HH&&";TXae&f	qQR\]^qQR\]^qQR\]^aXYcderF   c                 Z   UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  u  pgpUR                  XgX-  5      R                  SSS5      nUR                  XgX-  5      n[        R                  " X45      n
U
[        U5      S-  -  n
[        R                  " U
SS9n
UR                  XgX-  5      nU
R                  SSS5      n
[        R                  " XZ5      R                  XgX5      nU R                  U5      nX+-   $ )Nr   r   r!   r   r]  )r  r  r  r  r   r   r   r^   bmmr   ri  r   r  )rA   re   r  r   r   r   r   channelsr   r   r   r   s               rC   r   JanusVQVAEAttnBlock.forward  s    		-0vvm,VVM*
vvm, /;.@.@+
f#++J&.QYYZ[]^`ab''
fnM
yy:#s8}'>?yy15 $++J&.Q#++Aq!4ii;CCJZ`hmmK0%%rF   )rr   r  r  r  r  r  r  r   s   @rC   r  r    s    f& &rF   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvDownsamplei  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr
   r   r   rw  )rz   r{   r	   r6   convr  s     rC   r{   !JanusVQVAEConvDownsample.__init__  s%    IIkAaYZ[	rF   c                 V    [         R                  " USSSS9nU R                  U5      nU$ )N)r   r!   r   r!   constantr   )padr   r   )ri  r  r  r	  s     rC   r    JanusVQVAEConvDownsample.forward  s+    mJVWX		-0rF   r  r  r   s   @rC   r  r    s    \ rF   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JanusVQVAEConvUpsamplei  c                 l   > [         TU ]  5         [        R                  R	                  XSSSS9U l        g )Nr
   r!   rw  )rz   r{   r^   r	   r6   r  r  s     rC   r{   JanusVQVAEConvUpsample.__init__  s,    HHOOK!TU_`Oa	rF   c                 T    [         R                  " USSS9nU R                  U5      nU$ )Ng       @nearest)scale_factorr   )ri  r   r  r	  s     rC   r   JanusVQVAEConvUpsample.forward
  s(    m#IV		-0rF   r  r  r   s   @rC   r  r    s    b rF   r  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	JanusVQVAEMidBlocki  r2   r  c                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        [        UUUS9U l        g )Nr2   rr   rs   )rz   r{   rp  block_1r  attn_1block_2)rA   r2   r  r   s      rC   r{   JanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
rF   re   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r	  s     rC   r   JanusVQVAEMidBlock.forward  s2    ]3M2]3rF   )r  r  r  )rH   rI   rJ   rK   r$   r   r{   r^   r   r   rW   r   r   s   @rC   r  r    s7    
/ 
3 
U\\ ell  rF   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )JanusVQVAEEncoderi&  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  X2SSSS9U l        S[        U5      -   nXpl        [        R                   " 5       U l        [%        U R                  5       GH   n[        R                   " 5       n	[        R                   " 5       n
X'U   -  nX&U   -  n[%        U R
                  5       HM  nU	R'                  [)        UUUS95        UnXR                  S-
  :X  d  M3  U
R'                  [+        U5      5        MO     [        R,                  " 5       nXl        Xl        XR                  S-
  :w  a  [3        U5      Ul        U R"                  R'                  U5        GM     [7        UW5      U l        [        R                  R;                  SUSSS	9U l        [        R                  R                  UU(       a  S
U-  OUSSSS9U l        g )Nr
   r!   rw  )r!   r  rr  rs  Trt  r   ) rz   r{   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrr   double_latentlatent_channelsr^   r	   r6   conv_intuplein_channel_multiplierr  downr  appendrp  r  Moduleblockattnr  
downsampler  midr<   norm_outconv_out)rA   r2   r  rr   r  r  r  r  i_levelr  r  block_in	block_outi_blockr  r   s                  rC   r{   JanusVQVAEEncoder.__init__'  s   "6#<#<=$33,,((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!45)%$,%. %22Q66KK 3H => 6 99;DJI..22":8"DIIT"- 30 &fh7**bxUYbf*g#0Ao ( 
rF   r   c                    U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nrx   r   r!   )r  r  r  r  r  r  r  r  r  r  r  r  r^   r  r  )rA   r   re   r  r  rZ  rd   s          rC   r   JanusVQVAEEncoder.forwardZ  sB   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH%67 !MM*;<U]]+<== MM*;<  rF   )r  r  r  r  r  r  r  r  )
rH   rI   rJ   rK   r{   r^   rn  r   rW   r   r   s   @rC   r  r  &  s     1
f!E$4$4 ! !rF   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )JanusVQVAEDecoderis  c           
      d  > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nX!R                  U R                  S-
     -  n[        R                  R                  X5SSSS9U l        [        X5      U l        [        R                  " 5       U l        [#        [%        U R                  5      5       H  n[        R                  " 5       n[        R                  " 5       nX!R                  U   -  n	[%        U R
                  S-   5       HM  n
UR'                  [)        UUU	S95        U	nX`R                  S-
  :X  d  M3  UR'                  [+        U5      5        MO     [        R,                  " 5       nX{l        Xl        US:w  a  [3        U5      Ul        U R                   R'                  U5        M     [        R                  R7                  SUSSS	9U l        [        R                  R                  XTSSSS9U l        g )
Nr!   r
   rw  r  r   rr  rs  Trt  )rz   r{   r  r  r  r  r  r  rs   r^   r	   r6   r  r  r  r  upreversedr  r  rp  r  r  r  r  r  upsampler<   r  r  )rA   r2   r  r  rs   r  r  r  r  r  r  r  r   s               rC   r{   JanusVQVAEDecoder.__init__t  s   "6#<#<=$33,, 00** !#<#<T=Q=QTU=U#VV xxaXYcde &f7 --/d&:&: ;<GMMOE==?D%(A(A'(JJI !4!4q!89)%$,%. %22Q66KK 3H => : BHG!|4X>GGNN2) =. **bxUYbf*gAVWabcrF   rZ  r   c                 r   U R                  U5      nU R                  U5      n[        U R                  5       H  n[        U R                  S-   5       Ho  nU R
                  U   R                  U   " U5      n[        U R
                  U   R                  5      S:  d  MM  U R
                  U   R                  U   " U5      nMq     X R                  S-
  :w  d  M  U R
                  U   R                  U5      nM     U R                  U5      nU[        R                  " U5      -  nU R                  U5      nU$ )Nr!   r   )r  r  r  r  r  r  r  r  r  r  r  r^   r  r  )rA   rZ  r  r  s       rC   r   JanusVQVAEDecoder.forward  s   ||L1 xx- T112G !4!4q!89#www/55g>|Ltwww',,-1#'777#3#8#8#A,#OL : ..22#www/88F 3 }}\2l33}}\2rF   )r  r  r  r  r  r  r  )
rH   rI   rJ   rK   r{   r^   r_   r   rW   r   r   s   @rC   r  r  s  s.    ,d\E$5$5 %:K:K  rF   r  aG  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv Taigman](https://arxiv.org/abs/2203.13131).
    )custom_introc                     ^  \ rS rSr\r/ SQrSrS\4U 4S jjrS\	R                  4S jrS\	R                  S\	R                  4S	 jr\\S\	R                  S\\	R                  \	R                  4   4S
 j5       5       rSrU =r$ )
JanusVQVAEi  )r  rp  rQ  r   r2   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  R                  UR                  UR                  S5      U l        [        R                  R                  UR                  UR                  S5      U l        U R                  5         [        U5      U l        SU l        U R#                  5         g )Nr!   F)rz   r{   r  r0  rQ  quantizer^   r	   r6   r  r}   
quant_convpost_quant_convevalr  decoderr!  r2  r   s     rC   r{   JanusVQVAE.__init__  s     (01&9((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		(0&+# 	rF   c                 v    U R                  U5      nU R                  U5      nU R                  U5      u  p4nX4U4$ r   )r0  r  r  )rA   r   re   quantemb_lossindicess         rC   encodeJanusVQVAE.encode  s<    \26#'==#? ''rF   rg  r   c                    UR                   S   U R                  R                  S   U R                  R                  S   -  :w  aM  [        SU R                  R                  S   U R                  R                  S   -   SUR                    S35      eU R                  R	                  U5      nU R                  U5      nU R                  U5      nU$ )a  
Decodes quantized token IDs into pixel values.
Args:
    image_tokens (torch.LongTensor): Batch of token IDs.
Returns:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Pixel values decoded from the token IDs.
r!   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   r  rX  r   rl  r  r  )rA   rg  codebook_entryre   r   s        rC   decodeJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2rF   c                     UR                   S   nU R                  U5      u  p4nU R                  UR                  US5      5      n[	        Xd5      nU$ )Nr   rx   )r   r  r  r   rY   )rA   r   r   r  r\   r  r[   r   s           rC   r   JanusVQVAE.forward  sQ     "''*
)-\)B&w#{{7<<
B+GH!"6GrF   )r  r0  r!  r  r  r  )rH   rI   rJ   rK   r$   rL   rO   r?  r{   r^   rn  r  r_   r  r   r   r   r   rW   r   r   s   @rC   r  r    s     $L
 %O/ (5#3#3 (5#3#3 8I8I & 	''	 
u  %"3"33	4	  	rF   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )JanusVQVAEAlignerMLPi  r2   c           	        > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " [        SUR                  5       Vs/ s H.  n[        R                  " UR
                  UR
                  5      PM0     sn5      U l
        [        UR                     U l        g s  snf rC  )rz   r{   r	   r5   r}   rD  r  r  r  r  rF  r   r   r  r"  s      rC   r{   JanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqNpRYYv,,f.C.CDNpq
 $F$5$56 rrH  c                     U R                  U5      nU R                   H  nU R                  U5      nU" U5      nM     U$ r   rJ  rK  s      rC   r   JanusVQVAEAlignerMLP.forward  rN  rF   rO  )	rH   rI   rJ   rK   r$   r{   r   rW   r   r   s   @rC   r  r    s    7/ 7 rF   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	JanusVQVAEHeadi  zOHead used for sampling tokens in image generation, replacing the usual lm head.r2   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        [        R                  " UR
                  UR                  5      U l        g r   )rz   r{   r	   r5   image_token_embed_dimrD  r  r   r   r  rT  vision_headr   s     rC   r{   JanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRrF   re   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r	  s     rC   r   JanusVQVAEHead.forward  s6    m4**=9((7rF   )r  r  r  )rH   rI   rJ   rK   r]   r$   r{   r^   r   tensorr   rW   r   r   s   @rC   r  r    s5    YS/ SU\\ ell  rF   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                   l  ^  \ rS rSrS\4U 4S jjrS rS rS r\	\
           SS\R                  S\R                  S	\\R                     S
\\R                     S\\   S\\R                     S\\R                     S\\   S\\   S\\   S\\\R                  4   4S jj5       5       rSrU =r$ )
JanusModeli  r2   c                   > [         TU ]  U5        Xl        [        R	                  UR
                  5      U l        [        U R                  R                  5      U l        [        R	                  UR                  5      U l        [        R                  " U R                  R                  R                  U R                  R                  R                  5      U l        [#        U R                  R                  5      U l        ['        U R                  R                  5      U l        [*        R,                  " UR.                  S9U l        SU l        U R5                  5         g )N)r2   F)rz   r{   r2   r.  _from_configr-   vision_modelrA  alignerr  	vq_configvqmodelr	   r?   rT  r}   generation_embeddingsr  generation_alignerr  generation_headr    from_configtext_configlanguage_modelr!  r2  r   s     rC   r{   JanusModel.__init__$  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#rF   c                 6    U R                   R                  5       $ r   )r  r=  r<  s    rC   r=  JanusModel.get_input_embeddings9  s    ""7799rF   c                 :    U R                   R                  U5        g r   )r  set_input_embeddingsrA   r   s     rC   r  JanusModel.set_input_embeddings<  s    007rF   c                 ^    U R                  U5      nU R                  UR                  5      nU$ r   )r  r  rd   )rA   r   image_embedss      rC   get_image_featuresJanusModel.get_image_features?  s,    ((6||L$B$BCrF   	input_idsr   r   rw   r*   cache_positionr&  	use_cacher   r$  logits_to_keepc                 r   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eU R                  (       a/  U R
                  (       a  U(       a  [        R                  S5        SnUb  Ub  [        S5      eUc  U R                  5       " U5      nUb  U R                  U5      nXR                   R                  :H  nUR                  S   nUR                  SU5      nUR                  S5      R                  SSU5      nUR                  UR                   UR"                  5      nUR%                  UU5      nU R&                  " SUUUUUU	U
UUS.	UD6n[)        UR*                  UR,                  UR.                  UR0                  Ub  WOS S9nU$ )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either onerx   )	r&  r   rw   r*   r  r   r$  r  r  )rd   r*   re   rf   rg   rG   )r2   r   r$  r   r!  r   r   r   r=  r  image_token_idr   r   r   r   r   devicer   masked_scatterr  rb   rd   r*   re   rf   )rA   r  r   r   rw   r*   r  r&  r  r   r$  r  r   r  image_attention_maskr}   image_features	lm_outputr   s                      rC   r   JanusModel.forwardD  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<s  &&4==##p "	#(Av    557	BM#22<@L#,0J0J#J %++B/I)11"i@N#7#A#A"#E#L#LRQSU^#_ +..}/C/C]EXEXYN)889M~^M'' 
')%+/!5))
 
	 .'99%55#11 ++0<0Hd
 rF   )	r  r2   r  r  r  r!  r  r  r   )NNNNNNNNNNr   )rH   rI   rJ   rK   r"   r{   r=  r  r  r   r   r^   rn  r_   r   r   r   r   r   r   r   rW   r   r   s   @rC   r  r    s*   { *:8
  '+*.1537+/5959$(,0/334H##H ''H !.	H
 u//0H "%H !!1!12H   1 12H D>H $D>H 'tnH c5<</0H  HrF   r  c                     ^  \ rS rSrSS/rSrS\4U 4S jjrS rS r	S	\
R                  S
\
R                  4S jrS rS rS rS r\\            S#S\
R&                  S\
R(                  S\\
R                     S\\
R&                     S\\   S\\
R&                     S\\
R(                     S\\
R&                     S\\   S\\   S\\   S\\\
R                  4   4S jj5       5       r      S$U 4S jjrS\
R                  4S jr\
R:                     S%S	\
R                  S\\
R&                     S \\   4U 4S! jjj5       rS"r U =r!$ )&JanusForConditionalGenerationi  z(model.language_model.embed_tokens.weightzlm_head.weightTr2   c                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  R                  UR                  R                  SS9U l
        U R                  5         g )NFr   )rz   r{   r2   r  r(   r	   r5   r  r|   
vocab_sizelm_headr2  r   s     rC   r{   &JanusForConditionalGeneration.__init__  sZ     '
yy!3!3!?!?ASASA^A^ejk 	rF   c                 J    U R                   R                  R                  5       $ r   )r(   r  r=  r<  s    rC   r=  2JanusForConditionalGeneration.get_input_embeddings  s    zz((==??rF   c                 N    U R                   R                  R                  U5        g r   )r(   r  r  r  s     rC   r  2JanusForConditionalGeneration.set_input_embeddings  s    

!!66u=rF   inputsr   c                 r    U R                   R                  U5      nU R                   R                  U5      nU$ r   )r(   r  r  )rA   r(  rZ  s      rC   'prepare_embeddings_for_image_generationEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s0    zz77?zz44\BrF   c                     U R                   $ r   r"  r<  s    rC   get_output_embeddings3JanusForConditionalGeneration.get_output_embeddings  s    ||rF   c                     Xl         g r   r-  )rA   new_embeddingss     rC   set_output_embeddings3JanusForConditionalGeneration.set_output_embeddings  s    %rF   c                     Xl         g r   r(   )rA   r  s     rC   set_decoder)JanusForConditionalGeneration.set_decoder  s    
rF   c                     U R                   $ r   r5  r<  s    rC   get_decoder)JanusForConditionalGeneration.get_decoder  s    zzrF   r  r   r   rw   r*   r  r&  labelsr  r   r$  r  c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  " SUUUUUUU	U
UUS.
UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb.  U R                  UXR                   R                  R                  S9n[        UUUR                  UR                  UR                  UR                   S9nU$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)
r  r   r   rw   r*   r&  r  r   r$  r  )rl   r;  r!  )rk   rl   r*   re   rf   rg   rG   )r2   r   r$  r(   rd   r4   r   slicer"  loss_functionr  r!  ri   r*   re   rf   rg   )rA   r  r   r   rw   r*   r  r&  r;  r  r   r$  r  r   r   re   slice_indicesrl   rk   r   s                       rC   r   %JanusForConditionalGeneration.forward  s'   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ** 
%)%+'/!5)
 
  118B>SV8W8W~ot4]kmA}a,?@A%%VF{{OfOfOqOq%rD,#33!//)) ' ; ;
 rF   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  X)S'   U	$ )N)r*   r&  r   r  r  r   r   )rz   prepare_inputs_for_generation)rA   r  r   r*   r   r&  r  r  r   model_inputsr   s             rC   rB  ;JanusForConditionalGeneration.prepare_inputs_for_generation  sR     w<
+')))
 
 !!+7(rF   rg  c                 x    U R                   R                  R                  U5      nUR                  SSSS5      nU$ )z
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.
Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
r   r   r
   r!   )r(   r   r  r   )rA   rg  decoded_images      rC   decode_image_tokens1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9rF   logits_processorc           	      N  > UR                  SU R                  5      n[        R                  " U5      nUR                  SS5      nUS:X  a  [        T%U ]  " SUUUS S.UD6$ UR                  " S0 UD6nUR                  5       [        R                  [        R                  4;  a  [        S5      eUR                  5         U R                  UR                  5       5        Ub  UO	[        5       nSUS'   UR                  c  [         R#                  S5        S	Ul        UR                  US
'   U R%                  XR&                  U5      u  pnUR(                  UR*                  p[-        UR.                  5      S:w  a  [        SUR.                   S35      eUS LnU R1                  X\UR*                  S9  UR                  (       a;  UR                  S:  a+  UR3                  [5        UR                  5      5        S Ul        U R7                  UUR.                  S   US UUS9nU R8                  " SUUUR:                  S.UD6u  pU R<                  R>                  R@                  RB                  nUR.                  u  pURE                  SS5      nUR                  SS 5      nURE                  SS5      nX'S'   UUS 2S S 24   UR&                  :g  UUS 2S S 24   URF                  S   :g  -  nUUS 2S S 24   RI                  UURJ                  5        U RM                  5       " U5      nU RO                  XU5      nURQ                  SS 5      cB  U RS                  URT                  =(       d    SUS-  [W        URX                  X-   5      UUS9US'   [Z        R\                  " X4XS9nUR^                  nUR`                  nURb                  nURd                  nURf                  nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS nU(       a	  U(       a  SOS n[i        U5       GHy  nU Rj                  " SUUS.UD6nUS   Rm                  UR*                  5      US'   US   Rm                  UR*                  5      US'   U R<                  Rn                  " S0 UDUUS.D6nU Rq                  UU5      nURr                  S S 2SS S 24   Ru                  5       n U R<                  Rw                  U 5      n!U" UU!5      n"URx                  (       a:  [Z        Rz                  " U"SS9n#[Z        R|                  " U#SS9R                  S5      n$O[Z        R                  " U"SS9n$U$US S 2U4'   [Z        R                  " U$U$/5      n$U$R                  S5      n$U R                  U$5      nGM|     U(       aT  U(       a  UW!4-  nU(       a  UW R                  5       4-  nU(       a  UWR                  -  nU(       a  UWR                  -  nU(       a  [        UW!UUUWR                  S9$ U$ ) Ngeneration_configgeneration_modetext)r(  r   rK  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   rN  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r!   )rK  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnrI  r  )r  r   expand_sizer   boi_token_idr*   static)cache_implementationr   max_cache_lenr  model_kwargs)r   r  rG   )r&  r  r  )r   r$  rx   r]  )num_samples)	sequencesscoresrl   rf   re   r*   )IpoprK  copydeepcopyrz   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   rN  r   warning_prepare_model_inputsbos_token_idr   r  r  r   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr(   r  r2   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr=  _get_initial_cache_positionr   
_get_cacherV  max
max_lengthr^   zerosr   r$  output_scoresoutput_logitsreturn_dict_in_generater  rB  r   r  #_update_model_kwargs_for_generationrd   cloner  	do_sampler   multinomialsqueezeargmaxcatr   r*  floatrf   re   r   r*   )&rA   r(  r   rI  r   rK  rL  rX  r  model_input_namer   r  kwargs_has_attention_maskrm  r   r   input_tokensmaskr&  generated_tokensr   r$  rw  rx  ry  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsirC  r   rZ  r[  next_token_scoresprobs
next_tokenr   s&                                        rC   r_  &JanusForConditionalGeneration.generate  sP    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   )//9&9 002>;P;PR`RnRn:ool  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N22L5
1	\ ")9)9vy1$MiooM^EF  %3$$>!$$%6ZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #'"D"D #
))>>#
 	#
	  ::2299JJ'oo
 ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@P@Z[) /> /L*+ !;;
'EUb .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'(A== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*jj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG )J #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#rF   )r2   r"  r(   )NNNNNNNNNNNr   )NNNNNNr,  )"rH   rI   rJ   rK   _tied_weights_keysrU   r"   r{   r=  r  r^   r   r*  r.  r2  r6  r9  r   r   rn  r_   r   r   r   r   r   r   rB  rG  no_gradr   r_  rW   r   r   s   @rC   r  r    s   DFVW!{ @>ell u|| 
&  '+*.1537+/5959-1$(,0/3349##9 ''9 !.	9
 u//09 "%9 !!1!129   1 129 ))*9 D>9 $D>9 'tn9 c5<</09  9| <
 
 ]]  $59:>	}$}$ !!1!12}$ ##67	}$ }$rF   r  )r&   r  r  r  r.  )r.   )Tr]  dataclassesr   typingr   r   r   r   r   r^   r	   activationsr   cache_utilsr   
generationr   r   r   r   generation.utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   autor    configuration_janusr"   r#   r$   torch.nn.functionalr   ri  
get_loggerrH   r   r&   rY   rb   ri   r  rn   r   r   r   r  r   r   r   r  r  r.  rA  rQ  rp  r  r  r  r  r  r  r  r  r  r  r  __all__rG   rF   rC   <module>r     s
  ,  ! 9 9   !   u u 9 B 9 X X F &   Q Q ## 
		H	% ?? ? ?@ -{ - - )C; )C )CX &C+ &C &CRHBII HV	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %4R299 RjRYY (.8 .bM
 M
` ;+ ; ;|BII $<"		 <"~)(BII )(X &"))  &F	ryy 	RYY  ,J!		 J!ZA		 AH ;% ;;|299 $RYY   
k% k
k\I$$8/ I$X
 trF   