
    fThl7                       S SK r S SKJr  S SKJrJrJrJrJr  S SK	r	S SK
Jr  S SKJs  Jr  SSKJr  SSKJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJ r   SSK!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-J.r.  SSK/J0r0J1r1J2r2  \-" 5       (       a  S SK3J4r4  SSK5J6r6  \.Rn                  " \85      r9\" S5       " S S\Rt                  5      5       r; " S S\Rt                  5      r<S r=SaS jr>S\	R~                  S\@S\	R~                  4S jrA SbS \Rt                  S!\	R~                  S"\	R~                  S#\	R~                  S$\\	R~                     S%\BS&\B4S' jjrC " S( S)\Rt                  5      rD " S* S+\5      rE " S, S-\Rt                  5      rF " S. S/\Rt                  5      rG " S0 S1\Rt                  5      rH " S2 S3\Rt                  5      rI " S4 S5\Rt                  5      rJ " S6 S7\Rt                  5      rK " S8 S9\Rt                  5      rL " S: S;\Rt                  5      rM " S< S=\Rt                  5      rN " S> S?\Rt                  5      rO " S@ SA\R                  5      rQ " SB SC\Rt                  5      rR " SD SE\Rt                  5      rS " SF SG\Rt                  5      rT " SH SI\Rt                  5      rU " SJ SK\Rt                  5      rV\+" SLSM9 " SN SO\&5      5       rW " SP SQ5      rX\+ " SR SS\&5      5       rY " ST SU\Rt                  5      rZ\+ " SV SW\Y5      5       r[ " SX SY\\*5      r\\+ " SZ S[\Y\5      5       r] " S\ S]\Y5      r^ " S^ S_\Y\5      r_/ S`Qr`g)c    N)cached_property)CallableListOptionalTupleUnion   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfig)	BlockMask)make_flex_block_causal_maskRMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )Emu3RMSNorm7   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z*
Emu3RMSNorm is equivalent to T5LayerNorm
N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/emu3/modeling_emu3.pyr*   Emu3RMSNorm.__init__9   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor-   float32powmeanrsqrtr0   r/   )r1   hidden_statesinput_dtypevariances       r5   forwardEmu3RMSNorm.forwardA   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r7   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler/   shaper0   r1   s    r5   
extra_reprEmu3RMSNorm.extra_reprH   s*    ))*+6$2G2G1HIIr7   )r0   r/   )ư>)	__name__
__module____qualname____firstlineno__r*   rE   rK   __static_attributes____classcell__r4   s   @r5   r&   r&   7   s    $;J Jr7   r&   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3MLPL   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nbias)r)   r*   configr2   intermediate_sizer+   Linearmlp_bias	gate_projup_proj	down_projr
   
hidden_actact_fnr1   r[   r4   s     r5   r*   Emu3MLP.__init__M   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r7   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ N)ra   rc   r_   r`   )r1   xra   s      r5   rE   Emu3MLP.forwardW   s6    NN4;;t~~a/@#ADLLQRO#ST	r7   )rc   r[   ra   r_   r2   r\   r`   rN   rO   rP   rQ   r*   rE   rR   rS   rT   s   @r5   rV   rV   L   s    0 r7   rV   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr:   r9   dim)rI   r-   cat)rh   x1x2s      r5   rotate_halfrq   \   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezerq   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r5   apply_rotary_pos_embr|   c   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr7   rB   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rI   expandreshape)rB   r}   batchnum_key_value_headsslenhead_dims         r5   	repeat_kvr   ~   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr7   modulequerykeyvalueattention_maskscalingdropoutc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr9   r	   r:   )rm   r<   )ptrainingr   )r   num_key_value_groupsr-   matmul	transposerI   r+   
functionalsoftmaxr>   r=   r<   r   r   
contiguous)r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r5   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r7   c                   F  ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\
\R                  \R                  4   S\\R                     S	\\   S
\\R                     S\\   S\
\R                  \\R                     \\
\R                        4   4S jjrSrU =r$ )Emu3Attention   =Multi-headed attention from 'Attention Is All You Need' paperr[   	layer_idxc                 P  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        g )Nr         TrY   )r)   r*   r[   r   getattrr2   num_attention_headsr   r   r   r   attention_dropout	is_causalr+   r]   attention_biasq_projk_projv_projo_projr1   r[   r   r4   s      r5   r*   Emu3Attention.__init__   sI   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r7   rB   position_embeddingsr   past_key_valuecache_positionr   r~   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS5      (       a  [        R                  S	5        O[         U R                  R                     nU" U U	U
UU4U R"                  (       d  S
OU R$                  U R&                  S.UD6u  nnUR(                  " / UQSP76 R+                  5       nU R-                  U5      nUU4$ )Nr:   r   r9   )rw   rv   r   eagersdpaoutput_attentionsF`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   )rI   r   r   viewr   r   r   r|   updater   r   r[   _attn_implementationgetloggerwarning_oncer   r   r   r   r   r   r   )r1   rB   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rv   rw   cache_kwargsattention_interfacer   r   s                     r5   rE   Emu3Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r7   )r   r[   r   r   r   r   r   r   r   r   r   NN)rN   rO   rP   rQ   __doc__r   intr*   r-   Tensorr   r   r   
LongTensorr   r   rE   rR   rS   rT   s   @r5   r   r      s    G
z 
c 
8 +/590)||0) #5<<#=>0) !.	0)
 !0) !!1!120) -.0) 
u||Xell3XeELL>Q5RR	S0) 0)r7   r   c                   v  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\	\\R                  \R                  4      S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )Emu3DecoderLayer   r[   r   c                 V  > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        [        R                  " UR                  5      U l        g )N)r[   r   r3   )r)   r*   r2   r   	self_attnrV   mlpr&   rms_norm_epsinput_layernormpost_attention_layernormr+   Dropoutr   r   r   s      r5   r*   Emu3DecoderLayer.__init__   s    !--&fJ6?*6+=+=6CVCVW(3F4F4FFL_L_(`%zz&":":;r7   rB   r   rx   r   r   	use_cacher   r   r~   c	                    Un
U R                  U5      nU R                  " SUUUUUUUUS.U	D6u  pXR                  U5      -   nUn
U R                  U5      nU R	                  U5      nXR                  U5      -   nU4nU(       a  X4-  nU$ )at  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
)rB   r   rx   r   r   r   r   r    )r   r   r   r   r   )r1   rB   r   rx   r   r   r   r   r   r   residualself_attn_weightsoutputss                r5   rE   Emu3DecoderLayer.forward   s    > !,,]; ,0>> 
,
')%)/) 3
,
 
,
( !<<#>> !55mD/ <<#>> "++Gr7   )r   r2   r   r   r   r   )NNNFFNN)rN   rO   rP   rQ   r   r   r*   r-   r   r   r   r   boolr   FloatTensorrE   rR   rS   rT   s   @r5   r   r      s    	<z 	<c 	< 2637*.,1$)59KO<||< !.< u//0	<
 !< $D>< D>< !!1!12< &eELL%,,,F&GH< 
u  (51B1BEDUDU1U+V"WW	X< <r7   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )Emu3VQVAEVectorQuantizeri<  a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
r[   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        U R                  R                  R                  R                  SUR                  -  SUR                  -  5        g )Ng            ?)
r)   r*   r+   	Embeddingcodebook_size	embed_dim	embeddingr/   datauniform_rd   s     r5   r*   !Emu3VQVAEVectorQuantizer.__init__G  sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr7   hidden_statec                    UR                   u  p#pEnUR                  SSSSS5      R                  5       nUR                  SU5      n[        R
                  " US-  SSS9n[        R
                  " U R                  R                  S-  SS	9n	S[        R                  " XpR                  R                  R                  SS5      5      -  n
X-   U
-
  n
[        R                  " U
SS	9nUR                  X#XV5      nU$ )
Nr   r   r	      r9   r:   T)rm   r;   rl   )rI   permuter   r   r-   sumr   r/   r   r   argmin)r1   r   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r5   rE    Emu3VQVAEVectorQuantizer.forwardL  s    8D8J8J5
h#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;^^=R=R=\=\]^`a=bcc	$4y@	$||I1=388v]##r7   )r   )rN   rO   rP   rQ   r   r!   r*   r-   r   rE   rR   rS   rT   s   @r5   r   r   <  s+    e e
$ELL $ $r7   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3VQVAEEncoderConvDownsamplei^  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr	   r9   r   kernel_sizestridepaddingr)   r*   r+   Conv2dconvr1   in_channelsr4   s     r5   r*   'Emu3VQVAEEncoderConvDownsample.__init___  %    IIkAaYZ[	r7   c                 V    [         R                  " USSSS9nU R                  U5      nU$ )N)r   r   r   r   constantr   )padmoder   )Fr  r  r1   rB   s     r5   rE   &Emu3VQVAEEncoderConvDownsample.forwardc  s+    mJVWX		-0r7   r  rj   rT   s   @r5   r   r   ^  s    \ r7   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3VQVAEEncoderConvUpsampleij  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr	   r   r   r  r  s     r5   r*   %Emu3VQVAEEncoderConvUpsample.__init__k  r	  r7   c                 T    [         R                  " USSS9nU R                  U5      nU$ )N       @nearestscale_factorr  )r  interpolater  r  s     r5   rE   $Emu3VQVAEEncoderConvUpsample.forwardo  s(    m#IV		-0r7   r  rj   rT   s   @r5   r  r  j  s    \ r7   r  c            	       j   ^  \ rS rSrS\S\S\\   S\\   4U 4S jjrS\R                  4S jr	S	r
U =r$ )
Emu3VQVAEConv3diu  
in_channelout_channelr   r  c                 R  > [         T	U ]  5         [        USS  USS  5       VVs/ s H	  u  pVXV-
  PM     nnnSU l        US S S2    H&  nU =R                  US-  US-  -   US-  4-  sl        M(     U =R                  S-  sl        [        R
                  " UUUUS9U l        g s  snnf )Nr   r   r:   r9   )r9   r   )r  )r)   r*   zipr  r+   Conv3dr  )
r1   r  r   r   r  
one_kernel
one_stridepadding_sizespad_sizer4   s
            r5   r*   Emu3VQVAEConv3d.__init__v  s     	ORS^_`_aSbdjklkmdnOopOo5KZ0Oop%dd+HLLX]X\98q=IIL ,II	
	 qs   B#rB   c                 h    [         R                  " XR                  5      nU R                  U5      nU$ rg   )r  r  r  r  r  s     r5   rE   Emu3VQVAEConv3d.forward  s(    m\\:		-0r7   )r  r  )rN   rO   rP   rQ   r   r   r*   r-   r   rE   rR   rS   rT   s   @r5   r  r  u  sK    

 
 3Z	

 c

,U\\  r7   r  c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Emu3VQVAESpatialNormi  r  out_channelsc                    > [         TU ]  5         [        R                  " USSSS9U l        [        R
                  " UUSSSS9U l        [        R
                  " UUSSSS9U l        g )N    rM   Tnum_channels
num_groupsr3   affiner   r   r   )r)   r*   r+   	GroupNorm
norm_layerr  conv_yconv_br1   r  r-  r4   s      r5   r*   Emu3VQVAESpatialNorm.__init__  sn    
 	,,%	
 ii
 ii
r7   rB   quant_statesc                     [         R                  " X!R                  SS  SS9nU R                  U5      nXR	                  U5      -  U R                  U5      -   nU$ )Nr   r  )sizer  )r  r  rI   r5  r6  r7  )r1   rB   r:  s      r5   rE   Emu3VQVAESpatialNorm.forward  sT    }}\8K8KBC8PW`a6%L(AADKKP\D]]r7   )r7  r6  r5  rN   rO   rP   rQ   r   r*   r-   r   rE   rR   rS   rT   s   @r5   r,  r,    s:    

 
8U\\   r7   r,  c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalUpsamplei  r  r   c                 D   > [         TU ]  5         [        UUSSS9U l        g )Nr	   r	   r	   r   r   r   r   r  r)   r*   r  r  r1   r  r   r4   s      r5   r*   "Emu3VQVAETemporalUpsample.__init__  (    
 	#!	
	r7   rB   c                 D   UR                   u  p#pEnUR                  SSSSS5      R                  5       R                  USU5      n[        R
                  " USSS	9nUR                  X#XVS5      R                  SSSSS5      R                  5       nU R                  U5      nU$ )
Nr   r   r	   r   r9   r:   r  r  r  )rI   r   r   r   r  r  r  )r1   rB   r   r   r   r   r   s          r5   rE   !Emu3VQVAETemporalUpsample.forward  s    8E8K8K5
h%--aAq!<GGINNz[]_ghm#IV%**:PRS[[\]_`bcefhijuuw		-0r7   r  r>  rT   s   @r5   r@  r@    s/    

 
U\\  r7   r@  c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalDownsamplei  r  r   c                 D   > [         TU ]  5         [        UUSSS9U l        g )N)r   r	   r	   )r9   r   r   rD  rE  rF  s      r5   r*   $Emu3VQVAETemporalDownsample.__init__  rH  r7   rB   c                 (    U R                  U5      nU$ rg   r  r  s     r5   rE   #Emu3VQVAETemporalDownsample.forward  s    		-0r7   r  r>  rT   s   @r5   rL  rL    s/    

 
U\\  r7   rL  c                   4   ^  \ rS rSr SU 4S jjrS rSrU =r$ )Emu3VQVAETemporalResnetBlocki  c                 f  > [         TU ]  5         Xl        Uc  UOUU l        [        R
                  " U5      U l        [        UUSSS9U l        [        R
                  " U5      U l	        [        UUSSS9U l
        U R                  U R                  :w  a  [        R                  " UUSSSS9U l        g g )NrB  rC  rD  r   r   r   )r)   r*   r  r-  r+   BatchNorm3dnorm1r  conv1norm2conv2r#  nin_shortcutr8  s      r5   r*   %Emu3VQVAETemporalResnetBlock.__init__  s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r7   c                 P   UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nX!-   $ rg   )	rU  r-   sigmoidrV  rW  rX  r  r-  rY  )r1   rB   r   s      r5   rE   $Emu3VQVAETemporalResnetBlock.forward  s     

=1}55

=1

=1}55

=1t000((2H''r7   )rV  rX  r  rY  rU  rW  r-  rg   rj   rT   s   @r5   rR  rR    s     @( (r7   rR  c                      ^  \ rS rSr  S	S\S\\   S\\   4U 4S jjjrS
S\R                  S\\R                     4S jjr	Sr
U =r$ )Emu3VQVAEResnetBlocki  r  r-  quant_channelsc                   > [         TU ]  5         Xl        Uc  UOUnX l        X0l        Uc9  [
        R                  " USSSS9U l        [
        R                  " USSSS9U l        O [        X15      U l        [        X25      U l        [
        R                  " UUSSSS9U l        [
        R                  " UUSSSS9U l        U R                  U R                  :w  a  [
        R                  " UUSSSS9U l        g g )	Nr/  rM   Tr0  r	   r   r   r   )r)   r*   r  r-  r`  r+   r4  rU  rW  r,  r  rV  rX  rY  )r1   r  r-  r`  r4   s       r5   r*   Emu3VQVAEResnetBlock.__init__  s     	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nJDJ-nKDJYY

 YY

 t000 "		!D 1r7   rB   c                 |   U R                   c  SOU4nUnU R                  " U/UQ76 nU[        R                  " U5      -  nU R	                  U5      nU R
                  " U/UQ76 nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nXA-   $ Nr   )
r`  rU  r-   r\  rV  rW  rX  r  r-  rY  )r1   rB   r`  	norm_argsr   s        r5   rE   Emu3VQVAEResnetBlock.forward>  s    --5BN;L	 

==9=}55

=1

==9=}55

=1t000((2H''r7   )rV  rX  r  rY  rU  rW  r-  r`  r   rg   )rN   rO   rP   rQ   r   r   r*   r-   r   rE   rR   rS   rT   s   @r5   r_  r_    s_     '+(,	** sm* !	* *X(U\\ (8ELLCY ( (r7   r_  c                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                     S\	\
   S\\R                  \	\R                     4   4S	 jjrS
rU =r$ )Emu3VQVAEAttentionBlockiP  r   r[   c                 .  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        SU l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   Fr   )r)   r*   r[   r2   r   r   	num_headsr   
ValueErrorscaler   r   r   r+   r]   r   r   r   out_projr   rd   s     r5   r*    Emu3VQVAEAttentionBlock.__init__S  s"   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..A %&!r7   rB   r   r   r~   c                    UR                   u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R	                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     n
U
" U UUU	UU R                  U R                  U R                   (       d  SOU R"                  S9u  pUR%                  XEU5      R'                  5       nU R)                  U5      nU(       d  SnX4$ )	z#Input shape: Batch x Time x Channelr   r9   r   r   r   r   )r   r   r   N)rI   r   r   r   r   rj  r   r   r   r[   r   r   r   r   r   rl  r   r   r   r   rm  )r1   rB   r   r   r   
seq_lengthr   querieskeysvaluesr   r   r   s                r5   rE   Emu3VQVAEAttentionBlock.forwardj  s    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0 L((r7   )r[   r   r   r   r   r   rj  r   rm  r   rl  r   )NF)rN   rO   rP   rQ   r   r!   r*   r-   r   r   r   r   rE   rR   rS   rT   s   @r5   rh  rh  P  sr    G& &4 26,1	-)||-) !.-) $D>	-)
 
u||Xell33	4-) -)r7   rh  c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )Emu3VQVAEGroupNormi  z
Same as the torch GroupNorm with the only difference that this ones accepts
an optional kwarg `quant_states` which is not used. This class makes it easier to
use SpatialNorm or GroupNorm without conditionals
c                 &   > [         TU ]  " S0 UD6  g rd  )r)   r*   )r1   r   r4   s     r5   r*   Emu3VQVAEGroupNorm.__init__  s    "6"r7   c                     [         R                  " XR                  U R                  U R                  U R
                  5      $ rg   )r  
group_normr2  r/   rZ   r3   )r1   inputr:  s      r5   rE   Emu3VQVAEGroupNorm.forward  s'    ||E??DKKDHHUUr7   r   rg   )	rN   rO   rP   rQ   r   r*   rE   rR   rS   rT   s   @r5   rv  rv    s    #V Vr7   rv  c                   p   ^  \ rS rSrSU 4S jjrSS\R                  S\\R                     4S jjrSr	U =r
$ )Emu3VQVAEMiddleBlocki  c                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        Uc  [        USSSS9U l        O[        X25      U l        [        UUUS9U l	        g )Nr  r-  r`  r/  rM   Tr0  )
r)   r*   r_  block_1rh  attn_1rv  	attn_normr,  block_2)r1   r[   r  r`  r4   s       r5   r*   Emu3VQVAEMiddleBlock.__init__  sm    +#$)

 .f5!/[UW]ajnoDN1.NDN+#$)
r7   rB   r:  c                 N   U R                  X5      nUnU R                  X5      nUR                  u  pEpgUR                  XEXg-  5      R	                  SS5      nU R                  U5      S   nUR                  XFXu5      R                  SSSS5      nX1-   nU R                  X5      nU$ )Nr   r9   r   r	   )	r  r  rI   r   r   r  r   r   r  )r1   rB   r:  r   r   r   r   r   s           r5   rE   Emu3VQVAEMiddleBlock.forward  s    ]A }C.;.A.A+
f%**:PZZ[\^_`M215%--j%RZZ[\^_abdef 0]Ar7   )r  r  r  r  rg   )rN   rO   rP   rQ   r*   r-   r   r   rE   rR   rS   rT   s   @r5   r~  r~    s1    
(
U%6%6 
huO`O`Fa 
 
r7   r~  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEDownBlocki  c                   > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nS[        U5      -   nX@l        [        R                  " 5       U l        [        U R                  5       GHL  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nX$U   -  n	X#U   -  n
[        U R
                  5       H~  nUR                  [        U	U
S95        U
n	UR                  c  M-  XQR                  ;   d  M>  UR                  [!        U5      5        UR                  [        R"                  " U	SSSS95        M     [        R$                  " 5       nXll        X|l        Xl        XPR                  S-
  :w  a  [-        U	5      Ul        U R                  R                  U5        GMO     g )N)r   r  r-  r/  rM   Tr0  r   )r)   r*   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrH   in_channel_multiplierr+   
ModuleListdownrangeappendr_  attn_resolutionsrh  r4  Moduleblockattn
attn_normsr   
downsample)r1   r[   r  r  r  i_levelr  r  r  block_in	block_outi_blockr  r4   s                r5   r*   Emu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112GMMOE==?DJ$W'EEH%7(CCI !4!45($,%. %**67F]F];]KK 7 ?@%%bllUW]ajn&op 6 99;DJI(O..22"@"JIIT"1 3r7   rB   c                 <   [        U R                  5       GH  u  p#[        U R                  5       H  nUR                  U   " U5      n[        UR                  5      S:  d  M3  UnUR                  U   " U5      nUR                  u  pgpUR                  XgX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XhX5      R                  SSSS5      nXQ-   nM     X R                  S-
  :w  d  M  UR                  U5      nGM     U$ )Nr   r   r9   r	   )	enumerater  r  r  r  r  r  r  rI   r   r   r   r   r  r  )
r1   rB   r  blocksr  r   r   r   r   r   s
             r5   rE   Emu3VQVAEDownBlock.forward  s   (3OG !4!45 &W 5m Dv{{#a',H$*$5$5g$>}$MM:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M 6 ..22 & 1 1- @  4" r7   )r  r  r  r  
rN   rO   rP   rQ   r*   r-   r   rE   rR   rS   rT   s   @r5   r  r    s     ##JU%6%6  r7   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Emu3VQVAEUpBlocki  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  UR                  S   -  n[        R                  " 5       U l
        [        [        U R                  5      5       GH8  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nUR                  UR                  U   -  n[        U R
                  S-   5       Hd  n	UR                  [        UUUS95        UnXAR                  ;   d  M0  UR                  [!        U5      5        UR                  [#        X#5      5        Mf     [        R$                  " 5       n
XZl        Xjl        Xzl        US:w  a  [-        U5      U
l        U R                  R1                  SU
5        GM;     g )Nr:   r   r  r   )r)   r*   r  r  r  r  r   r  r+   r  upreversedr  r  r_  r  rh  r,  r  r  r  r  r  upsampleinsert)r1   r[   r`  r  r  r  r  r  r  r  r  r4   s              r5   r*   Emu3VQVAEUpBlock.__init__  si   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;<GMMOE==?DJ,,v/H/H/QQI !4!4q!89($,%.'5 %555KK 7 ?@%%&:>&TU : BHG&M!|:8DGGNN1b!3 =r7   rB   r:  c                 b   [        U R                  S S S2   5       GH  u  p4[        U R                  S-   5       H  nUR                  U   " X5      n[        UR                  5      S:  d  M3  UnUR                  U   " X5      nUR                  u  pxpUR                  XxX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XyX5      R                  SSSS5      nXa-   nM     U[        U R                  5      S-
  :w  d  M  UR                  U5      nGM     U$ )Nr:   r   r   r9   r	   )r  r  r  r  r  r  r  r  rI   r   r   r   r   r  )r1   rB   r:  r  r  r  r   r   r   r   r   s              r5   rE   Emu3VQVAEUpBlock.forward+  s   (27OG !4!4q!89 &W 5m Rv{{#a',H$*$5$5g$>}$[M:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M : #dgg,** & >  8  r7   )r  r  r  r  rT   s   @r5   r  r    s-    #"JU%6%6 eFWFW  r7   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEEncoderi?  c                   > [         TU ]  5         UR                  nUR                  nUR                  nUR
                  nUR                  nU(       a  SU-  OUnX&S   -  n[        R                  R                  X2SSSS9U l
        [        U5      U l        [        X5      U l        [        R                  R                  SUSSS	9U l        [        R                  R                  UUSSSS9U l        [%        [&        R(                  " UR*                  5      5      n	[        R,                  " 5       U l        [        R,                  " 5       U l        [3        U	5       H)  n
[5        Xw5      nU R.                  R7                  U5        M+     [3        UR8                  5       H(  n[;        UUS
9nU R0                  R7                  U5        M*     g )Nr9   r:   r	   r   r   r/  rM   T)r2  r1  r3   r3  r  )r)   r*   r  r  double_latentlatent_channelsr  r-   r+   r  conv_inr  
down_blockr~  middle_blockr4  norm_outconv_outr   mathlog2temporal_downsample_factorr  	time_convtime_res_stackr  rL  r  r  rR  )r1   r[   r  r  r  r  r  r-  r  temporal_down_blocksir  _time_res_convr4   s                 r5   r*   Emu3VQVAEEncoder.__init__@  s   ,,((,, 00#66.;q?* b#99xx{qYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+,A.|JDNN!!$' - v,,-A8()M &&}5 .r7   pixel_valuesc                 t   UR                   S   nUR                  " S/UR                   SS  Q76 nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nUR                  " SU/UR                   SS  Q76 nUR                  SSSSS5      nU R                   H$  nU" U5      nU[        R                  " U5      -  nM&     U R                   H  nU" U5      nM     UR                  SSSSS5      nU$ )Nr   r:   r9   r   r	   r   )rI   r   r  r  r  r  r-   r\  r  r   r  r  )r1   r  temporal_dimrB   r  layers         r5   rE   Emu3VQVAEEncoder.forwardg  s:   #))!,#++BH1C1CAB1GH \26))-8 m4}55m4%--b,YATATUVUWAXY%--aAq!< NND /MU]]=99M # ((E!-0M ) &--aAq!<r7   )r  r  r  r  r  r  r  )
rN   rO   rP   rQ   r*   r-   r   rE   rR   rS   rT   s   @r5   r  r  ?  s     %6NE$4$4  r7   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Emu3VQVAEDecoderi  r[   c                   > [         T	U ]  5         UR                  nUR                  UR                  S   -  n[
        R                  " 5       U l        [        UR                  5       H<  n[        UR                  UR                  S9nU R                  R                  U5        M>     [        [        R                  " UR                   5      5      n[
        R                  " 5       U l        [        U5       H>  n[%        UR                  UR                  5      nU R"                  R                  U5        M@     [
        R&                  " UR                  USSSS9U l        [+        XUS9U l        [/        U5      U l        UR                  UR                  S   -  n[3        X#5      U l        [
        R&                  " UUR6                  SSSS9U l        g )Nr:   r  r	   r   r   )r`  r   )r)   r*   r   r  r  r+   r  r  r  r  rR  r  r  r   r  r  r  r  r@  r  r  r~  r  r  up_blockr,  r  r-  r  )
r1   r[   r`  r  r  r  temp_upsample_block_numr  r  r4   s
            r5   r*   Emu3VQVAEDecoder.__init__  s|   ))''&*C*CB*GG mmov,,-A8"22AWAWM &&}5	 . #&dii0Q0Q&R"S./A,V-C-CVE[E[\DNN!!$' 0 yy""
 1R`a(0''&*C*CA*FF,^F		
r7   rB   r:  c                    [         R                  " X4SS9nUR                  SSSSS5      nU R                   H  nU" U5      nM     U R                   H$  nU" U5      nU[         R
                  " U5      -  nM&     UR                  SSSSS5      n[         R                  " USSS9u  pUR                  " S/UR                  SS  Q76 nUR                  " S/UR                  SS  Q76 nU R                  U5      nU R                  X5      nU R                  X5      nU R                  X5      nU[         R
                  " U5      -  nU R                  U5      nU$ )Nr   rl   r9   r   r	   r   r:   )r-   rn   r   r  r  r\  chunkr   rI   r  r  r  r  r  )r1   rB   r:  hidden_quant_statesr  s        r5   rE   Emu3VQVAEDecoder.forward  sV   #ii(E1M199!Q1aH ((E"'(;"< ) ^^E"'(;"<5==1D#EE $ 299!Q1aH&+kk2Eqa&P#%--bK=3F3Fqr3JK#++BH1C1CAB1GH]3 ))-FmBmB}55m4r7   )r  r  r  r  r  r  r  )rN   rO   rP   rQ   r!   r*   r-   r   rE   rR   rS   rT   s   @r5   r  r    s0    %
 %
NU\\   r7   r  aF  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv Taigman](https://arxiv.org/abs/2203.13131).
    )custom_introc                      ^  \ rS rSr\rSrSrSrSr	Sr
Sr/ SQrS rS\4U 4S jjrS\R                   S	\R                   4S
 jrS\R                   4S jrSrU =r$ )	Emu3VQVAEi  
emuvideovqr  T)rR  rh  r_  r   c                    [        U[        R                  [        R                  45      (       a  [        R                  R                  UR                  SSS9  UR                  bq  [        R                  R                  UR                  5      u  p#S[        R                  " U5      -  n[        R                  R                  UR                  U* U5        g g [        U[        R                  5      (       a  [        R                  R                  UR                  [        R                  " S5      S9  UR                  by  [        R                  R                  UR                  5      u  p#US:  a  S[        R                  " U5      -  OSn[        R                  R                  UR                  U* U5        g g [        U[        R                  [        R                  [        R                   45      (       aU  [        R                  R#                  UR                  S5        [        R                  R#                  UR                  S	5        g [        U[        R$                  5      (       ad  UR                  R&                  R)                  5         UR*                  b2  UR                  R&                  UR*                     R-                  5         g g g )
Nfan_outrelu)r  nonlinearityr      )ar   r   r   )
isinstancer+   r  r#  initkaiming_normal_r/   rZ   _calculate_fan_in_and_fan_outr  sqrtr   r]   kaiming_uniform_BatchNorm2drT  r4  	constant_r   r   normal_padding_idxzero_)r1   r   fan_inr  bounds        r5   _init_weightsEmu3VQVAE._init_weights  s   fryy"))455GG##FMM	PV#W{{&GGAA&--P	DIIf--  ufe< ' 		**GG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< '  NOOGGfmmS1GGfkk3/--MM&&(!!-""6#5#56<<> . .r7   r[   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        S[        UR                  5      S-
  -  U l        [        UR                  UR                  SSS9U l        [        UR                  UR                  SSS9U l        S[        UR                  5      S-
  -  U l        U R%                  5         U R'                  5         g )Nr9   r   )r	   r   r   rC  rD  )r)   r*   r[   r  encoderr  decoderr   quantizer  r  vision_spatial_factorr  r  r   
quant_convpost_quant_convspatial_scale_factoreval	post_initrd   s     r5   r*   Emu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r7   image_sizesc                    UR                   S:H  nU(       aJ  U R                  R                  nUR                  u  pVpxUR	                  S5      R                  SUSSS5      nOUR                  u  pTpgnU R                  U5      n	U	R                  SSSSS5      n	U R                  U	5      n	U	R                  SSSSS5      n	U R                  U	5      n
U(       a  U
R                  S5      OU
n[        X5       VVs/ s HB  u  pUS [        US   U R                  -  5      2S [        US   U R                  -  5      24   PMD     nnnU$ s  snnf )Nr   r   r   r9   r	   )ndimr[   r  rI   rs   repeatr  r   r  r  squeezer"  r   r  )r1   r  r  is_imager   r   r   r   r   rB   codesimage_tokenssingle_imager<  s                 r5   encodeEmu3VQVAE.encode  sP   $$){{==H2>2D2D/J&'11!4;;AxAqQL<H<N<N9J(E\2 &--aAq!<6 &--aAq!<m,+3u}}Q' '*,&D
&D" D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr&D 	 

 
s   6A	ErB   c                    UR                   S:H  nU(       a  UR                  S5      nUR                  u  p4pVU R                  R	                  UR                  5       5      nUR                  S   nUR                  X4XVU5      R                  SSSSS5      R                  5       nU R                  U5      n	UR                  SSSSS5      nU	R                  SSSSS5      n	U R                  X5      n
U
R                  UX@R                  R                  -  U R                  R                  XPR                  -  X`R                  -  5      n
U(       a	  U
S S 2S4   $ U
$ )Nr	   r   r:   r   r   r9   )r  rs   rI   r  r   flattenr   r   r   r  r  r   r[   r  r-  r  )r1   rB   r  r   r   r   r   quantr   
post_quantvideos              r5   decodeEmu3VQVAE.decode'  s;    %%*)33A6M.;.A.A+
f''(=(=(?@;;r?

:IQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/{{===KK$$...---
 'uQT{1E1r7   )r[   r  r  r  r  r  r  r  )rN   rO   rP   rQ   r!   config_classbase_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backend_no_split_modulesr  r*   r-   r   r   r  rR   rS   rT   s   @r5   r  r    su     #L$$ON!"&?* *5<< ell 82ELL 2 2r7   r  c                       \ rS rSrSrS r\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       rS
\\R                     S\R                  4S jrS
\R                  S\R                  4S jrSrg)Emu3ImageVocabularyMappingiA  zE
A class for mapping discrete image tokens from VQGAN to BPE tokens.
c                 h    Xl         UR                  S5      U l        UR                  S5      U l        g )Nz<|extra_200|>z<image>)	vocab_mapr   eol_token_idimage_token_id)r1   r  s     r5   r*   #Emu3ImageVocabularyMapping.__init__F  s)    "%MM/:'mmI6r7   c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf Nz<|visual tokensortedr  items
startswithr1   namevals      r5   r  'Emu3ImageVocabularyMapping.image_tokensK  s<    DNN,@,@,Bh,BytdooVfFgs,Bhiih   A
A
c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf r  r  r  s      r5   image_tokens_str+Emu3ImageVocabularyMapping.image_tokens_strO  s<    T^^-A-A-Ci-C	tWgGht-Cijjir"  c                 z    U R                    Vs0 s H  n[        USS 5      U R                  U   _M!     sn$ s  snf )Nir   )r$  r   r  )r1   tokens     r5   img2bpe"Emu3ImageVocabularyMapping.img2bpeS  s;    FJF[F[\F[UE"RL!4>>%#88F[\\\s   &8c                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf rg   )r(  r  )r1   ru   vs      r5   bpe2img"Emu3ImageVocabularyMapping.bpe2imgW  s-    !%!3!3!56!5!5666s   0c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ Nr   r<   )r-   zerosmaxr,  rr  r   r  r1   mappingru   r+  s       r5   bpe2img_mapping_tensor1Emu3ImageVocabularyMapping.bpe2img_mapping_tensor[  R    ++c$,,"3"3"56:%))LLL&&(DAAJ )r7   c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ r/  )r-   r1  r2  r(  rr  r   r  r3  s       r5   img2bpe_mapping_tensor1Emu3ImageVocabularyMapping.img2bpe_mapping_tensorb  r7  r7   	img_batchr~   c                 "   UR                   n[        R                  " UR                  S   S4[        R                  S9U R
                  -  nU R                  UR                  S5         n[        R                  " XC/SS9nUR                  U5      $ )Nr   r   r0  cpur:   rl   )	devicer-   r.   rI   r   r  r9  r=   rn   )r1   r;  r>  eol_row
img_tokenss        r5   convert_img2bpe*Emu3ImageVocabularyMapping.convert_img2bpei  su    !!**iooa0!4EIIFIZIZZ00e1DE
YY
4"=
}}V$$r7   c                     UR                   nUSS S24   nU R                  UR                  S5         nUR                  U5      $ )N.r:   r=  )r>  r5  r=   )r1   r;  r>  r@  s       r5   convert_bpe2img*Emu3ImageVocabularyMapping.convert_bpe2imgp  sG    !!c3B3h'	00e1DE
}}V$$r7   )r  r  r  N)rN   rO   rP   rQ   r   r*   r   r  r$  r(  r,  r5  r9  r   r-   r   rA  rD  rR   r   r7   r5   r  r  A  s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r7   r  c                   T    \ rS rSr\rSrSrS/rSS/r	Sr
SrSrSrSrSrSrSrS rS	rg
)Emu3PreTrainedModeliw  modelTr   past_key_valuesr   Fc                    U R                   R                  5       R                  n[        U[        R
                  [        R                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g g )Nr   )r@   stdr   )r[   get_text_configinitializer_ranger  r+   r]   r  r/   r   r  rZ   r  r   r  r&   fill_)r1   r   rK  s      r5   r  !Emu3PreTrainedModel._init_weights  s    kk))+==fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .,,MM$$S) -r7   r   N)rN   rO   rP   rQ   r   r	  r
  supports_gradient_checkpointingr  _skip_keys_device_placementr  r  _supports_quantized_cache_supports_cache_class_supports_static_cache!_supports_param_buffer_assignmentr  r  r  rR   r   r7   r5   rG  rG  w  s_    L&*# $5m"D!N $ !(-%"&*r7   rG  c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )Emu3RotaryEmbeddingi  r[   c                   > [         TU ]  5         [        US5      (       aH  UR                  b;  UR                  R	                  SUR                  R	                  S5      5      U l        OSU l        UR                  U l        UR                  U l        Xl	        [        U R
                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                  U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r)   r*   hasattrrY  r   rZ  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr[   r   rope_init_fnattention_scalingregister_bufferr]  original_inv_freq)r1   r[   r>  r]  r4   s       r5   r*   Emu3RotaryEmbedding.__init__  s    6>**v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r7   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r:   r   mpsr=  F)device_typeenabledr9   rl   r0  )r]  floatr   rI   r=   r>  r  r[  strr-   autocastr   rn   rv   rd  rw   r<   )
r1   rh   rx   inv_freq_expandedposition_ids_expandedrj  freqsembrv   rw   s
             r5   rE   Emu3RotaryEmbedding.forward  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)rd  r[   ra  rf  rb  rc  rZ  rg   )rN   rO   rP   rQ   r   r*   r-   no_gradr   rE   rR   rS   rT   s   @r5   rW  rW    s6    /z / /" ]]_<  <r7   rW  c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	         SS\
\R                     S\
\R                     S\
\R                     S	\
\   S
\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\   S\4S jj5       5       r SS\\R                  S4   S\R                  S\R                  S	\S\4
S jjr\S\R                  S\S\S\R2                  S\R                  S\4S j5       rSrU =r$ )Emu3TextModeli  r[   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr   )r[   F)r)   r*   pad_token_idr  
vocab_sizer+   r   r2   embed_tokensr  r  num_hidden_layersr   layersr&   r   normrW  
rotary_embgradient_checkpointingr  r   s      r5   r*   Emu3TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabBaYf0Bab
   2 28K8KL	-V<&+# 	 cs   C?c                     U R                   $ rg   rz  rJ   s    r5   get_input_embeddings"Emu3TextModel.get_input_embeddings  s       r7   c                     Xl         g rg   r  r1   r   s     r5   set_input_embeddings"Emu3TextModel.set_input_embeddings  s    !r7   	input_idsr   rx   rI  inputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsr~   c
                 J   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        Sn[        U[        S 5      [        45      (       d  [	        S5      eUc  U R                  U5      nU(       a  Uc
  [        5       nU	cD  Ub  UR                  5       OSn[        R                   " XUR"                  S   -   UR$                  S9n	Uc  U	R'                  S5      nU R)                  X%XU5      nUnU R+                  X5      nU(       a  SOS nU(       a  SOS nU R,                  S U R                   R.                    H7  nU(       a  X4-  nU" U4UUUUUU	US	.U
D6nUS   nU(       d  M.  UUS   4-  nM9     U R1                  U5      nU(       a  X4-  n[3        UU(       a  UOS UUS
9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBThe `past_key_values` should be either a `Cache` object or `None`.r   r   r>  r   )r   rx   r   r   r   r   r   )last_hidden_staterI  rB   
attentions)r[   r   r  r   rk  r  r   r   r   r  r[  r   rz  r   get_seq_lengthr-   arangerI   r>  rs   _update_causal_maskr~  r|  r{  r}  r   )r1   r  r   rx   rI  r  r   r   r  r   r  past_seen_tokensr   rB   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r5   rE   Emu3TextModel.forward  sI    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I /DJ+>??abb  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 & #oomJ #7BD0d![[)H4;;+H+HIM#!%55!)
*)."3#-$7
 $
M *!,M  =#3"55' J* 		-0  !11&+/8Od+%	
 	
r7   r"   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r   flex_attentionr   Fr   )r  past_key_values_lengthis_trainingr   r:   )sequence_lengthtarget_lengthr<   r   r   )cudaxpunpu)r[   r   anyr  r-   r   r#   r  is_compileabler   _ignore_causal_mask_sdpar   r<   rI   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr>  r[  finfomin_unmask_unattended)r1   r   r  r   rI  r   r  using_compilable_cacher<   r  r  r   	min_dtypes                r5   r  !Emu3TextModel._update_causal_mask1  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr7   r  r  r<   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ 	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nr   )
fill_valuer<   r>  r   )diagonalr  r:   r   rm   r-   r  r  fullr>  triur  r   r   clonerI   r=   masked_fillr   r  r  r<   r   r   r   r   r  mask_lengthpadding_masks              r5   r  CEmu3TextModel._prepare_4d_causal_attention_mask_with_cache_positionu  s}   < %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r7   )rz  r  r|  r}  r  r~  ry  )	NNNNNNNNN)F)rN   rO   rP   rQ   r   r*   r  r  r   r   r   r-   r   r   r   r   r   r   r   r   rE   r   r  staticmethodr   r<   r  rR   rS   rT   s   @r5   rv  rv    s   z  !"  151537+/59$(,0/359\
E,,-\
 !.\
 u//0	\

 "%\
   1 12\
 D>\
 $D>\
 'tn\
 !!1!12\
 $$89\
 
!\
  \
H #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r7   rv  c                       \ rS rSrSrg)KwargsForCausalLMi  r   N)rN   rO   rP   rQ   rR   r   r7   r5   r  r    s    3r7   r  c                     ^  \ rS rSrS/rSS0rSS/S/40r\rU 4S jr	S r
S	 rS
 rS rS rS r\\           SS\\R(                     S\\R*                     S\\R(                     S\\   S\\R.                     S\\R(                     S\\   S\\   S\\   S\\R(                     S\\\R*                  4   S\\   S\4S jj5       5       rSrU =r $ )Emu3ForCausalLMi  zlm_head.weightlm_headcolwise_reprB   logitsc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g NFrY   )
r)   r*   rv  rH  ry  r+   r]   r2   r  r  rd   s     r5   r*   Emu3ForCausalLM.__init__  sU     "6*
 ++yy!3!3V5F5FUS 	r7   c                 .    U R                   R                  $ rg   rH  rz  rJ   s    r5   r  $Emu3ForCausalLM.get_input_embeddings  s    zz&&&r7   c                 $    XR                   l        g rg   r  r  s     r5   r  $Emu3ForCausalLM.set_input_embeddings  s    "'

r7   c                     U R                   $ rg   r  rJ   s    r5   get_output_embeddings%Emu3ForCausalLM.get_output_embeddings  s    ||r7   c                     Xl         g rg   r  )r1   new_embeddingss     r5   set_output_embeddings%Emu3ForCausalLM.set_output_embeddings  s    %r7   c                     Xl         g rg   rH  )r1   r  s     r5   set_decoderEmu3ForCausalLM.set_decoder  s    
r7   c                     U R                   $ rg   r  rJ   s    r5   get_decoderEmu3ForCausalLM.get_decoder  s    zzr7   r  r   rx   rI  r  labelsr   r   r  r   logits_to_keepr   r~   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb)  U R                  " SUX`R                   R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import requests
>>> from PIL import Image

>>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```N)	r  r   rx   rI  r  r   r   r  r   r  r  ry  lossr  rI  rB   r  r   )r[   r   r  rH  r  r  r   slicer  loss_functionry  r   rI  rB   r  )r1   r  r   rx   rI  r  r  r   r   r  r   r  r   r   rB   slice_indicesr  r  s                     r5   rE   Emu3ForCausalLM.forward  s   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%%pVF{{OeOepiopD%#33!//))
 	
r7   )r  rH  ry  )NNNNNNNNNNr   )!rN   rO   rP   rQ   _tied_weights_keys_tp_plan_pp_planr    r	  r*   r  r  r  r  r  r  r   r   r   r-   r   r   r   r   r   r   r   r   r  r   rE   rR   rS   rT   s   @r5   r  r    s   *+=)H_-z:;H!L'(&  151537+/59-1$(,0/35934G
E,,-G
 !.G
 u//0	G

 "%G
   1 12G
 ))*G
 D>G
 $D>G
 'tnG
 !!1!12G
 c5<</0G
 *+G
 
 G
  G
r7   r  c            !       @  ^  \ rS rSrSS0rSrU 4S jrS rS rS\	R                  S	\	R                  4S
 jrS\	R                  S	\	R                  4S jr\	R                  S\	R                  S\S\4S j5       r\\            SS\	R                  S\	R                  S	\	R&                  S\\	R&                     S\\	R                     S\\   S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\\   S\\\4   4S jj5       5       rSrU =r$ )	Emu3Modeli  ztext_model.model
text_modelFc                   > [         TU ]  U5        [        R                  UR                  5      U l        U R
                  R                  b/  U R
                  R                   Vs/ s H  nSU 3PM
     snU l        [        UR                  5      U l	        [        UR                  5      U l        U R                  5         g s  snf )Nztext_model.)r)   r*   rv  _from_configtext_configr  r  r  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingr  )r1   r[   ru   r4   s      r5   r*   Emu3Model.__init__"  s     '44V5G5GH??--9BF//BdBd&eBdQQC'8Bd&eD# !1!12"<V=R=R"S 	 'fs   $C c                 6    U R                   R                  5       $ rg   )r  r  rJ   s    r5   r  Emu3Model.get_input_embeddings.  s    3355r7   c                 :    U R                   R                  U5        g rg   )r  r  r  s     r5   r  Emu3Model.set_input_embeddings1  s    ,,U3r7   r  r  c                 N    [         R                  S5        U R                  U5      $ )Nz`model.get_image_tokens()` is deprecated and will be removed in v4.58. To obtain discrete token use `model.get_image_features()`)r   warningget_image_featues)r1   r  r  s      r5   get_image_tokensEmu3Model.get_image_tokens4  s'     O	
 %%l33r7   c                     U R                   R                  X5      nU Vs/ s H+  o@R                  R                  U5      R	                  5       PM-     nn[
        R                  " U5      nU$ s  snf )a  
Tokenizes images into discrete tokens with VQGAN module. Converts
obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
special tokens.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
    image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
        The sizes of the images in the batch, being (height, width) for each image.
)r  r   r  rA  r  r-   rn   )r1   r  r  image_tokens_listtokensbpe_tokens_list
bpe_tokenss          r5   get_image_featuresEmu3Model.get_image_features:  sb     !LL//JctuctY_22BB6JRRTctuYY/
 vs   2A,r  r   r   c                     USS2SS24   R                  SX#S-   5      nU R                  R                  U5      nU R                  R	                  U5      nU$ )a  
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.

Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
    height (`int`):
        Height of the generated image before upsampling.
    width (`int`):
        Width of the generated image before upsampling.
Nr:   r   )r   r  rD  r  r  )r1   r  r   r   	sequencesimages         r5   decode_image_tokensEmu3Model.decode_image_tokensK  sV     !CRC(--b&!)D	..>>yI##L1r7   r  r   rx   rI  r  r   r   r  return_dictr   r   r~   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUSL USL-  (       a  [	        S5      eUb  Ub  [	        S5      eUb`  U R                  X#5      nXR                  R                  :H  nUR                  UR                  UR                  5      nUR                  X5      nU R                  " SUUUUUUU	U
SUS.
UD6nU$ )aH  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oneT
r  r   rx   rI  r  r   r   r  r
  r   r   )r[   r   r  use_return_dictrk  r  r  r  r=   r>  r<   masked_scatterr  )r1   r  r  r  r   rx   rI  r  r   r   r  r
  r   r   r  special_image_maskr   s                    r5   rE   Emu3Model.forward^  s-   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<s  #(Av  #22<ML!*.E.E.T.T!T'??9+;+;Y__ML!001CRI // 
)%+'/!5)
 
 r7   )r  r  r  r  )NNNNNNNNNNNN)rN   rO   rP   rQ   _checkpoint_conversion_mappingrT  r*   r  r  r-   r   r   r  r  rt  r   r  r   r   r   r   r   r   r   r   r   r   r   rE   rR   rS   rT   s   @r5   r  r    s   &8,%G""
644U->-> 4UM]M] 4u/@/@ uO_O_ " ]]0@0@ # VY  $  '+*.$(1537+/59$(,0/3&*59;##; ''; \\	;
 !.; u//0; "%;   1 12; D>; $D>; 'tn; d^; !!1!12; -.; 
u,,	-;  ;r7   r  c            %       v  ^  \ rS rSrSrSSSS.rSrU 4S jrS	 rS
 r	\
S 5       r\
S 5       r\\              S%S\R                   S\R"                  S\R$                  S\\R$                     S\\R                      S\\   S\\R"                     S\\   S\\   S\\   S\\   S\\R                      S\\R                      S\\\R$                  4   S\\   S\\\4   4 S jj5       5       r       S&U 4S jjr\S\R$                  S\S \S!\R>                  S\R$                  S"\4S# j5       r S$r!U =r"$ )'Emu3ForConditionalGenerationi   zmodel.text_modelzmodel.vqmodelr  )z^text_model.modelz^vqmodelz^text_model.lm_headFc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g r  )r)   r*   r  rH  r+   r]   r  r2   ry  r  r  rd   s     r5   r*   %Emu3ForConditionalGeneration.__init__  sS     v&
yy!3!3!?!?ASASA^A^ejkr7   c                 6    U R                   R                  5       $ rg   )rH  r  rJ   s    r5   r  1Emu3ForConditionalGeneration.get_input_embeddings  s    zz..00r7   c                 :    U R                   R                  U5        g rg   )rH  r  r  s     r5   r  1Emu3ForConditionalGeneration.set_input_embeddings  s    

''.r7   c                 .    U R                   R                  $ rg   )rH  r  rJ   s    r5   r  'Emu3ForConditionalGeneration.text_model  s    zz$$$r7   c                 .    U R                   R                  $ rg   )rH  r  rJ   s    r5   r  $Emu3ForConditionalGeneration.vqmodel  s    zz!!!r7   r  r  r  r   rx   rI  r  r   r   r  r
  r   r  r  r   r~   c                 ,   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  " SUUUUUUU	U
SUS.
UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb3  U R                  " SUXR                   R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )aL  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import requests
>>> from PIL import Image

>>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> conversation = [
...     {
...     "role": "system",
...     "content": [
...         {"type": "text", "text": "You are a helpful assistant."},
...         ],
...     },
...     {
...     "role": "user",
...     "content": [
...         {"type": "image"},
...         {"type": "text", "text": "Please describe the image."},
...         ],
...     },
... ]

>>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
>>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

>>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```NTr  r   r  r  r   )r[   r   r  r  rH  r  r   r  r  r  r  ry  r   rI  rB   r  )r1   r  r  r  r   rx   rI  r  r   r   r  r
  r   r  r  r   r   rB   r  r  r  s                        r5   rE   $Emu3ForConditionalGeneration.forward  s>   B 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
)%+'/!5)
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD &#33!//))
 	
r7   c	                 V   > [         TU ]  " U4UUUUUUUS.U	D6n
US   S:w  a  S U
S'   U
$ )N)rI  r   r  r   rx   r  r   r   r  )r)   prepare_inputs_for_generation)r1   r  rI  r   r  r   rx   r   r  r   model_inputsr4   s              r5   r"  :Emu3ForConditionalGeneration.prepare_inputs_for_generation%  sZ     w<

+)')%%

 

 !!+/L(r7   r  r  r<   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ r  r  r  s              r5   r  REmu3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_positionD  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r7   )r  rH  )NNNNNNNNNNNNNr   )NNNNNTN)#rN   rO   rP   rQ   r
  r  rT  r*   r  r  propertyr  r  r   r   r-   r   r   r   r   r   r   r   r   r   r  r   r   rE   r"  r  r<   r  rR   rS   rT   s   @r5   r  r    s>   /#(&"
 #1/ % % " "  '+*.$(1537+/59$(,0/3&*59-134d
##d
 ''d
 \\	d

 !.d
 u//0d
 "%d
   1 12d
 D>d
 $D>d
 'tnd
 d^d
 !!1!12d
 ))*d
 c5<</0d
  *+!d
" 
u,,	-#d
  d
R > 444 4 {{	4
 4 4 4r7   r  )r  r  rv  rG  r  r  )Nr   )r   )ar  	functoolsr   typingr   r   r   r   r   r-   torch.nnr+   torch.nn.functionalr   r  activationsr
   cache_utilsr   r   
generationr   integrationsr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   configuration_emu3r   r    r!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerrN   r   r  r&   rV   rq   r|   r   r   r   rl  r   r   r   r   r   r  r  r,  r@  rL  rR  r_  rh  r4  rv  r~  r  r  r  r  r  r  rG  rW  rv  r  r  r  r  __all__r   r7   r5   <module>r=     sH  .  % 9 9     ! . ) 7 > B 9 O K F & h h K K  !!;J 
		H	% Y'J")) J (J(bii  (6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %4J)BII J)ZH1 HV$ryy $D	RYY 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~G)bii G)TV V299 D8 8v7ryy 7tCryy CLCryy CL l2 l2l2^3% 3%l */ * *><")) <D p' p pf ?,j > j
)? j
 j
Z}# }@\#6 \~r7   