
    fTh                     p   S SK r S SKJr  S SKJrJrJrJr  S SKrS SK	J
r
  S SKJ
s  Jr  S SKrSSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJ r J!r!  SSK"J#r#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,  \RZ                  " \.5      r/ " S S\$5      r0 " S S\
Rb                  5      r2 " S S\!5      r3 " S S\
Rb                  5      r4 " S S\
Rb                  5      r5 " S S\
Rb                  5      r6 " S S\
Rb                  5      r7 " S  S!\
Rb                  5      r8 " S" S#\
Rb                  5      r9 " S$ S%\
Rb                  5      r: " S& S'\(5      r; " S( S)\
Rx                  5      r= " S* S+\
Rb                  5      r> " S, S-\
Rb                  5      r? " S. S/\
Rb                  5      r@ " S0 S1\
Rb                  5      rA " S2 S3\
Rb                  5      rB\" S4S59 " S6 S7\5      5       rC " S8 S95      rD " S: S;\ \C5      rE " S< S=\&\E5      rF " S> S?\%\E\5      rG " S@ SA\E5      rH " SB SC\E\5      rI/ SDQrJg)E    N)cached_property)ListOptionalTupleUnion   )Cache)GenerationMixin)FlashAttentionKwargs)CausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )ChameleonPreTrainedModel#ChameleonVQVAEEncoderConvDownsample)KwargsForCausalLMLlamaDecoderLayerLlamaForCausalLM
LlamaModel)SiglipAttention   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                   v  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\	\\R                  \R                  4      S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )Emu3DecoderLayer+   config	layer_idxc                 n   > [         TU ]  X5        [        R                  " UR                  5      U l        g N)super__init__nnDropoutattention_dropoutdropoutselfr!   r"   	__class__s      ]/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/emu3/modular_emu3.pyr&   Emu3DecoderLayer.__init__,   s&    +zz&":":;    hidden_statesattention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    Un
U R                  U5      nU R                  " SUUUUUUUUS.U	D6u  pXR                  U5      -   nUn
U R                  U5      nU R	                  U5      nXR                  U5      -   nU4nU(       a  X4-  nU$ )at  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
)r1   r2   r3   r4   r5   r6   r7   r8    )input_layernorm	self_attnr*   post_attention_layernormmlp)r,   r1   r2   r3   r4   r5   r6   r7   r8   kwargsresidualself_attn_weightsoutputss                r.   forwardEmu3DecoderLayer.forward0   s    > !,,]; ,0>> 
,
')%)/) 3
,
 
,
( !<<#>> !55mD/ <<#>> "++Gr0   )r*   )NNNFFNN)__name__
__module____qualname____firstlineno__r   intr&   torchTensorr   
LongTensorr	   boolr   FloatTensorrD   __static_attributes____classcell__r-   s   @r.   r   r   +   s    <z <c < 2637*.,1$)59KO<||< !.< u//0	<
 !< $D>< D>< !!1!12< &eELL%,,,F&GH< 
u  (51B1BEDUDU1U+V"WW	X< <r0   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )Emu3VQVAEVectorQuantizero   a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
r!   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        U R                  R                  R                  R                  SUR                  -  SUR                  -  5        g )Ng            ?)
r%   r&   r'   	Embeddingcodebook_size	embed_dim	embeddingweightdatauniform_r,   r!   r-   s     r.   r&   !Emu3VQVAEVectorQuantizer.__init__z   sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr0   hidden_statec                    UR                   u  p#pEnUR                  SSSSS5      R                  5       nUR                  SU5      n[        R
                  " US-  SSS9n[        R
                  " U R                  R                  S-  SS	9n	S[        R                  " XpR                  R                  R                  SS5      5      -  n
X-   U
-
  n
[        R                  " U
SS	9nUR                  X#XV5      nU$ )
Nr   r   r      r   T)dimkeepdimre   )shapepermute
contiguousviewrK   sumr[   r\   matmul	transposeargmin)r,   ra   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r.   rD    Emu3VQVAEVectorQuantizer.forward   s    8D8J8J5
h#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;^^=R=R=\=\]^`a=bcc	$4y@	$||I1=388v]##r0   )r[   )rF   rG   rH   rI   __doc__r   r&   rK   rL   rD   rP   rQ   rR   s   @r.   rT   rT   o   s+    e e
$ELL $ $r0   rT   c                       \ rS rSrSrg)Emu3VQVAEEncoderConvDownsample   r;   N)rF   rG   rH   rI   rP   r;   r0   r.   r}   r}      s    r0   r}   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Emu3VQVAEEncoderConvUpsample   c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr   r   kernel_sizestridepadding)r%   r&   r'   Conv2dconv)r,   in_channelsr-   s     r.   r&   %Emu3VQVAEEncoderConvUpsample.__init__   s%    IIkAaYZ[	r0   c                 T    [         R                  " USSS9nU R                  U5      nU$ )N       @nearestscale_factormode)Finterpolater   r,   r1   s     r.   rD   $Emu3VQVAEEncoderConvUpsample.forward   s(    m#IV		-0r0   r   rF   rG   rH   rI   r&   rD   rP   rQ   rR   s   @r.   r   r      s    \ r0   r   c            	       j   ^  \ rS rSrS\S\S\\   S\\   4U 4S jjrS\R                  4S jr	S	r
U =r$ )
Emu3VQVAEConv3d   
in_channelout_channelr   r   c                 R  > [         T	U ]  5         [        USS  USS  5       VVs/ s H	  u  pVXV-
  PM     nnnSU l        US S S2    H&  nU =R                  US-  US-  -   US-  4-  sl        M(     U =R                  S-  sl        [        R
                  " UUUUS9U l        g s  snnf )Nr   r;   rd   r   )r   r   )r   )r%   r&   zipr   r'   Conv3dr   )
r,   r   r   r   r   
one_kernel
one_stridepadding_sizespad_sizer-   s
            r.   r&   Emu3VQVAEConv3d.__init__   s     	ORS^_`_aSbdjklkmdnOopOo5KZ0Oop%dd+HLLX]X\98q=IIL ,II	
	 qs   B#r1   c                 h    [         R                  " XR                  5      nU R                  U5      nU$ r$   )r   padr   r   r   s     r.   rD   Emu3VQVAEConv3d.forward   s(    m\\:		-0r0   )r   r   )rF   rG   rH   rI   rJ   r   r&   rK   rL   rD   rP   rQ   rR   s   @r.   r   r      sK    

 
 3Z	

 c

,U\\  r0   r   c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Emu3VQVAESpatialNorm   r   out_channelsc                    > [         TU ]  5         [        R                  " USSSS9U l        [        R
                  " UUSSSS9U l        [        R
                  " UUSSSS9U l        g )N    ư>Tnum_channels
num_groupsepsaffiner   r   r   )r%   r&   r'   	GroupNorm
norm_layerr   conv_yconv_br,   r   r   r-   s      r.   r&   Emu3VQVAESpatialNorm.__init__   sn    
 	,,%	
 ii
 ii
r0   r1   quant_statesc                     [         R                  " X!R                  SS  SS9nU R                  U5      nXR	                  U5      -  U R                  U5      -   nU$ )Nr   )sizer   )r   r   rh   r   r   r   )r,   r1   r   s      r.   rD   Emu3VQVAESpatialNorm.forward   sT    }}\8K8KBC8PW`a6%L(AADKKP\D]]r0   )r   r   r   rF   rG   rH   rI   rJ   r&   rK   rL   rD   rP   rQ   rR   s   @r.   r   r      s:    

 
8U\\   r0   r   c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalUpsample   r   r   c                 D   > [         TU ]  5         [        UUSSS9U l        g )Nr   r   r   r   r   r   r   r   r%   r&   r   r   r,   r   r   r-   s      r.   r&   "Emu3VQVAETemporalUpsample.__init__   (    
 	#!	
	r0   r1   c                 D   UR                   u  p#pEnUR                  SSSSS5      R                  5       R                  USU5      n[        R
                  " USSS	9nUR                  X#XVS5      R                  SSSSS5      R                  5       nU R                  U5      nU$ )
Nr   r   r   rc   r   rd   r   r   r   )rh   ri   rj   rk   r   r   r   )r,   r1   rp   rr   rq   rs   rt   s          r.   rD   !Emu3VQVAETemporalUpsample.forward   s    8E8K8K5
h%--aAq!<GGINNz[]_ghm#IV%**:PRS[[\]_`bcefhijuuw		-0r0   r   r   rR   s   @r.   r   r      s/    

 
U\\  r0   r   c                   V   ^  \ rS rSrS\S\4U 4S jjrS\R                  4S jrSr	U =r
$ )Emu3VQVAETemporalDownsample   r   r   c                 D   > [         TU ]  5         [        UUSSS9U l        g )N)rc   r   r   )r   r   r   r   r   r   s      r.   r&   $Emu3VQVAETemporalDownsample.__init__   r   r0   r1   c                 (    U R                  U5      nU$ r$   r   r   s     r.   rD   #Emu3VQVAETemporalDownsample.forward  s    		-0r0   r   r   rR   s   @r.   r   r      s/    

 
U\\  r0   r   c                   4   ^  \ rS rSr SU 4S jjrS rSrU =r$ )Emu3VQVAETemporalResnetBlocki  c                 f  > [         TU ]  5         Xl        Uc  UOUU l        [        R
                  " U5      U l        [        UUSSS9U l        [        R
                  " U5      U l	        [        UUSSS9U l
        U R                  U R                  :w  a  [        R                  " UUSSSS9U l        g g )Nr   r   r   r   r   r   )r%   r&   r   r   r'   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr   s      r.   r&   %Emu3VQVAETemporalResnetBlock.__init__  s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r0   c                 P   UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nX!-   $ r$   )	r   rK   sigmoidr   r   r   r   r   r   )r,   r1   rA   s      r.   rD   $Emu3VQVAETemporalResnetBlock.forward,  s     

=1}55

=1

=1}55

=1t000((2H''r0   )r   r   r   r   r   r   r   r$   r   rR   s   @r.   r   r     s     @( (r0   r   c                      ^  \ rS rSr  S	S\S\\   S\\   4U 4S jjjrS
S\R                  S\\R                     4S jjr	Sr
U =r$ )Emu3VQVAEResnetBlocki<  r   r   quant_channelsc                   > [         TU ]  5         Xl        Uc  UOUnX l        X0l        Uc9  [
        R                  " USSSS9U l        [
        R                  " USSSS9U l        O [        X15      U l        [        X25      U l        [
        R                  " UUSSSS9U l        [
        R                  " UUSSSS9U l        U R                  U R                  :w  a  [
        R                  " UUSSSS9U l        g g )	Nr   r   Tr   r   r   r   r   )r%   r&   r   r   r   r'   r   r   r   r   r   r   r   r   )r,   r   r   r   r-   s       r.   r&   Emu3VQVAEResnetBlock.__init__=  s     	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nJDJ-nKDJYY

 YY

 t000 "		!D 1r0   r1   c                 |   U R                   c  SOU4nUnU R                  " U/UQ76 nU[        R                  " U5      -  nU R	                  U5      nU R
                  " U/UQ76 nU[        R                  " U5      -  nU R                  U5      nU R                  U R                  :w  a  U R                  U5      nXA-   $ Nr;   )
r   r   rK   r   r   r   r   r   r   r   )r,   r1   r   	norm_argsrA   s        r.   rD   Emu3VQVAEResnetBlock.forwardi  s    --5BN;L	 

==9=}55

=1

==9=}55

=1t000((2H''r0   )r   r   r   r   r   r   r   r   )NNr$   )rF   rG   rH   rI   rJ   r   r&   rK   rL   rD   rP   rQ   rR   s   @r.   r   r   <  s_     '+(,	** sm* !	* *X(U\\ (8ELLCY ( (r0   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Emu3VQVAEAttentionBlocki{  r!   c                 2   > [         TU ]  U5        SU l        g )Nr   )r%   r&   num_key_value_groupsr_   s     r.   r&    Emu3VQVAEAttentionBlock.__init__|  s      %&!r0   )r   )rF   rG   rH   rI   r   r&   rP   rQ   rR   s   @r.   r   r   {  s    & & &r0   r   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )Emu3VQVAEGroupNormi  z
Same as the torch GroupNorm with the only difference that this ones accepts
an optional kwarg `quant_states` which is not used. This class makes it easier to
use SpatialNorm or GroupNorm without conditionals
c                 &   > [         TU ]  " S0 UD6  g r   )r%   r&   )r,   r@   r-   s     r.   r&   Emu3VQVAEGroupNorm.__init__  s    "6"r0   c                     [         R                  " XR                  U R                  U R                  U R
                  5      $ r$   )r   
group_normr   r\   biasr   )r,   inputr   s      r.   rD   Emu3VQVAEGroupNorm.forward  s'    ||E??DKKDHHUUr0   r;   r$   )	rF   rG   rH   rI   r{   r&   rD   rP   rQ   rR   s   @r.   r   r     s    #V Vr0   r   c                   p   ^  \ rS rSrSU 4S jjrSS\R                  S\\R                     4S jjrSr	U =r
$ )Emu3VQVAEMiddleBlocki  c                    > [         TU ]  5         [        UUUS9U l        [	        U5      U l        Uc  [        USSSS9U l        O[        X25      U l        [        UUUS9U l	        g )Nr   r   r   r   r   Tr   )
r%   r&   r   block_1r   attn_1r   	attn_normr   block_2)r,   r!   r   r   r-   s       r.   r&   Emu3VQVAEMiddleBlock.__init__  sm    +#$)

 .f5!/[UW]ajnoDN1.NDN+#$)
r0   r1   r   c                 N   U R                  X5      nUnU R                  X5      nUR                  u  pEpgUR                  XEXg-  5      R	                  SS5      nU R                  U5      S   nUR                  XFXu5      R                  SSSS5      nX1-   nU R                  X5      nU$ )Nr   r   r   r   )	r   r   rh   rk   rn   r   reshaperi   r   )r,   r1   r   rA   rp   rr   rs   rt   s           r.   rD   Emu3VQVAEMiddleBlock.forward  s    ]A }C.;.A.A+
f%**:PZZ[\^_`M215%--j%RZZ[\^_abdef 0]Ar0   )r   r   r   r   r$   )rF   rG   rH   rI   r&   rK   rO   r   rD   rP   rQ   rR   s   @r.   r   r     s1    
(
U%6%6 
huO`O`Fa 
 
r0   r   c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEDownBlocki  c                   > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nS[        U5      -   nX@l        [        R                  " 5       U l        [        U R                  5       GHL  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nX$U   -  n	X#U   -  n
[        U R
                  5       H~  nUR                  [        U	U
S95        U
n	UR                  c  M-  XQR                  ;   d  M>  UR                  [!        U5      5        UR                  [        R"                  " U	SSSS95        M     [        R$                  " 5       nXll        X|l        Xl        XPR                  S-
  :w  a  [-        U	5      Ul        U R                  R                  U5        GMO     g )N)r   r   r   r   r   Tr   r   )r%   r&   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelstuplein_channel_multiplierr'   
ModuleListdownrangeappendr   attn_resolutionsr   r   Moduleblockattn
attn_normsr}   
downsample)r,   r!   r  r  r  i_levelr  r  r  block_in	block_outi_blockr  r-   s                r.   r&   Emu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112GMMOE==?DJ$W'EEH%7(CCI !4!45($,%. %**67F]F];]KK 7 ?@%%bllUW]ajn&op 6 99;DJI(O..22"@"JIIT"1 3r0   r1   c                 <   [        U R                  5       GH  u  p#[        U R                  5       H  nUR                  U   " U5      n[        UR                  5      S:  d  M3  UnUR                  U   " U5      nUR                  u  pgpUR                  XgX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XhX5      R                  SSSS5      nXQ-   nM     X R                  S-
  :w  d  M  UR                  U5      nGM     U$ )Nr   r   r   r   )	enumerater  r  r
  r  r  r  r  rh   rk   rn   r  ri   r	  r  )
r,   r1   r  blocksr  rA   rp   rr   rs   rt   s
             r.   rD   Emu3VQVAEDownBlock.forward  s   (3OG !4!45 &W 5m Dv{{#a',H$*$5$5g$>}$MM:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M 6 ..22 & 1 1- @  4" r0   )r  r  r
  r	  
rF   rG   rH   rI   r&   rK   rO   rD   rP   rQ   rR   s   @r.   r  r    s     ##JU%6%6  r0   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Emu3VQVAEUpBlocki  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  UR                  S   -  n[        R                  " 5       U l
        [        [        U R                  5      5       GH8  n[        R                  " 5       n[        R                  " 5       n[        R                  " 5       nUR                  UR                  U   -  n[        U R
                  S-   5       Hd  n	UR                  [        UUUS95        UnXAR                  ;   d  M0  UR                  [!        U5      5        UR                  [#        X#5      5        Mf     [        R$                  " 5       n
XZl        Xjl        Xzl        US:w  a  [-        U5      U
l        U R                  R1                  SU
5        GM;     g )Nrd   r   r   r   )r%   r&   r  r  r	  r
  rZ   r  r'   r  upreversedr  r  r   r  r   r   r  r  r  r  r   upsampleinsert)r,   r!   r   r  r  r  r  r  r  r  r%  r-   s              r.   r&   Emu3VQVAEUpBlock.__init__  si   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;<GMMOE==?DJ,,v/H/H/QQI !4!4q!89($,%.'5 %555KK 7 ?@%%&:>&TU : BHG&M!|:8DGGNN1b!3 =r0   r1   r   c                 b   [        U R                  S S S2   5       GH  u  p4[        U R                  S-   5       H  nUR                  U   " X5      n[        UR                  5      S:  d  M3  UnUR                  U   " X5      nUR                  u  pxpUR                  XxX-  5      R                  SS5      nUR                  U   " U5      S   nUR                  XyX5      R                  SSSS5      nXa-   nM     U[        U R                  5      S-
  :w  d  M  UR                  U5      nGM     U$ )Nrd   r   r   r   r   )r  r%  r  r
  r  r  r  r  rh   rk   rn   r  ri   r'  )r,   r1   r   r  r  r  rA   rp   rr   rs   rt   s              r.   rD   Emu3VQVAEUpBlock.forward  s   (27OG !4!4q!89 &W 5m Rv{{#a',H$*$5$5g$>}$[M:G:M:M7J&$1$6$6zV^$\$f$fghjk$lM$*KK$8$G$JM$1$9$9*e$^$f$fghjkmnpq$rM$,$<M : #dgg,** & >  8  r0   )r
  r	  r%  r!  rR   s   @r.   r#  r#    s-    #"JU%6%6 eFWFW  r0   r#  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )Emu3VQVAEEncoderi(  c                   > [         TU ]  5         UR                  nUR                  nUR                  nUR
                  nUR                  nU(       a  SU-  OUnX&S   -  n[        R                  R                  X2SSSS9U l
        [        U5      U l        [        X5      U l        [        R                  R                  SUSSS	9U l        [        R                  R                  UUSSSS9U l        [%        [&        R(                  " UR*                  5      5      n	[        R,                  " 5       U l        [        R,                  " 5       U l        [3        U	5       H)  n
[5        Xw5      nU R.                  R7                  U5        M+     [3        UR8                  5       H(  n[;        UUS
9nU R0                  R7                  U5        M*     g )Nr   rd   r   r   r   r   r   T)r   r   r   r   r  )r%   r&   r  r   double_latentlatent_channelsr  rK   r'   r   conv_inr  
down_blockr   middle_blockr   norm_outconv_outrJ   mathlog2temporal_downsample_factorr  	time_convtime_res_stackr  r   r  r
  r   )r,   r!   r  r   r/  r0  r  r   r  temporal_down_blocksir   _time_res_convr-   s                 r.   r&   Emu3VQVAEEncoder.__init__)  s   ,,((,, 00#66.;q?* b#99xx{qYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+,A.|JDNN!!$' - v,,-A8()M &&}5 .r0   pixel_valuesc                 t   UR                   S   nUR                  " S/UR                   SS  Q76 nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU[        R                  " U5      -  nU R                  U5      nUR                  " SU/UR                   SS  Q76 nUR                  SSSSS5      nU R                   H$  nU" U5      nU[        R                  " U5      -  nM&     U R                   H  nU" U5      nM     UR                  SSSSS5      nU$ )Nr   rd   r   r   r   rc   )rh   r  r1  r2  r3  r4  rK   r   r5  ri   r9  r:  )r,   r@  temporal_dimr1   r   layers         r.   rD   Emu3VQVAEEncoder.forwardP  s:   #))!,#++BH1C1CAB1GH \26))-8 m4}55m4%--b,YATATUVUWAXY%--aAq!< NND /MU]]=99M # ((E!-0M ) &--aAq!<r0   )r1  r5  r2  r3  r4  r9  r:  )
rF   rG   rH   rI   r&   rK   rM   rD   rP   rQ   rR   s   @r.   r-  r-  (  s     %6NE$4$4  r0   r-  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Emu3VQVAEDecoderin  r!   c                   > [         T	U ]  5         UR                  nUR                  UR                  S   -  n[
        R                  " 5       U l        [        UR                  5       H<  n[        UR                  UR                  S9nU R                  R                  U5        M>     [        [        R                  " UR                   5      5      n[
        R                  " 5       U l        [        U5       H>  n[%        UR                  UR                  5      nU R"                  R                  U5        M@     [
        R&                  " UR                  USSSS9U l        [+        XUS9U l        [/        U5      U l        UR                  UR                  S   -  n[3        X#5      U l        [
        R&                  " UUR6                  SSSS9U l        g )Nrd   r  r   r   r   )r   r   )r%   r&   rZ   r  r  r'   r  r:  r  r
  r   r0  r  rJ   r6  r7  r8  r9  r   r   r1  r   r3  r#  up_blockr   r4  r   r5  )
r,   r!   r   r  r=  r>  temp_upsample_block_numr<  r   r-   s
            r.   r&   Emu3VQVAEDecoder.__init__o  s|   ))''&*C*CB*GG mmov,,-A8"22AWAWM &&}5	 . #&dii0Q0Q&R"S./A,V-C-CVE[E[\DNN!!$' 0 yy""
 1R`a(0''&*C*CA*FF,^F		
r0   r1   r   c                    [         R                  " X4SS9nUR                  SSSSS5      nU R                   H  nU" U5      nM     U R                   H$  nU" U5      nU[         R
                  " U5      -  nM&     UR                  SSSSS5      n[         R                  " USSS9u  pUR                  " S/UR                  SS  Q76 nUR                  " S/UR                  SS  Q76 nU R                  U5      nU R                  X5      nU R                  X5      nU R                  X5      nU[         R
                  " U5      -  nU R                  U5      nU$ )Nr   rg   r   r   r   rc   rd   )rK   catri   r:  r9  r   chunkr  rh   r1  r3  rH  r4  r5  )r,   r1   r   hidden_quant_statesrC  s        r.   rD   Emu3VQVAEDecoder.forward  sV   #ii(E1M199!Q1aH ((E"'(;"< ) ^^E"'(;"<5==1D#EE $ 299!Q1aH&+kk2Eqa&P#%--bK=3F3Fqr3JK#++BH1C1CAB1GH]3 ))-FmBmB}55m4r0   )r1  r5  r3  r4  r9  r:  rH  )rF   rG   rH   rI   r   r&   rK   rL   rD   rP   rQ   rR   s   @r.   rF  rF  n  s0    %
 %
NU\\   r0   rF  aF  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv Taigman](https://arxiv.org/abs/2203.13131).
    )custom_introc                      ^  \ rS rSr\rSrSrSrSr	Sr
Sr/ SQrS rS\4U 4S jjrS\R                   S	\R                   4S
 jrS\R                   4S jrSrU =r$ )	Emu3VQVAEi  
emuvideovqr@  T)r   r   r   rT   c                    [        U[        R                  [        R                  45      (       a  [        R                  R                  UR                  SSS9  UR                  bq  [        R                  R                  UR                  5      u  p#S[        R                  " U5      -  n[        R                  R                  UR                  U* U5        g g [        U[        R                  5      (       a  [        R                  R                  UR                  [        R                  " S5      S9  UR                  by  [        R                  R                  UR                  5      u  p#US:  a  S[        R                  " U5      -  OSn[        R                  R                  UR                  U* U5        g g [        U[        R                  [        R                  [        R                   45      (       aU  [        R                  R#                  UR                  S5        [        R                  R#                  UR                  S	5        g [        U[        R$                  5      (       ad  UR                  R&                  R)                  5         UR*                  b2  UR                  R&                  UR*                     R-                  5         g g g )
Nfan_outrelu)r   nonlinearityr      )ar   rW           )
isinstancer'   r   r   initkaiming_normal_r\   r   _calculate_fan_in_and_fan_outr6  sqrtr^   Linearkaiming_uniform_BatchNorm2dr   r   	constant_rX   r]   normal_padding_idxzero_)r,   modulefan_inr=  bounds        r.   _init_weightsEmu3VQVAE._init_weights  s   fryy"))455GG##FMM	PV#W{{&GGAA&--P	DIIf--  ufe< ' 		**GG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< '  NOOGGfmmS1GGfkk3/--MM&&(!!-""6#5#56<<> . .r0   r!   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        S[        UR                  5      S-
  -  U l        [        UR                  UR                  SSS9U l        [        UR                  UR                  SSS9U l        S[        UR                  5      S-
  -  U l        U R%                  5         U R'                  5         g )Nr   r   )r   r   r   r   r   )r%   r&   r!   r-  encoderrF  decoderrT   quantizer  r  vision_spatial_factorr   r0  rZ   
quant_convpost_quant_convspatial_scale_factoreval	post_initr_   s     r.   r&   Emu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r0   image_sizesc                    UR                   S:H  nU(       aJ  U R                  R                  nUR                  u  pVpxUR	                  S5      R                  SUSSS5      nOUR                  u  pTpgnU R                  U5      n	U	R                  SSSSS5      n	U R                  U	5      n	U	R                  SSSSS5      n	U R                  U	5      n
U(       a  U
R                  S5      OU
n[        X5       VVs/ s HB  u  pUS [        US   U R                  -  5      2S [        US   U R                  -  5      24   PMD     nnnU$ s  snnf )Nrc   r   r   r   r   )ndimr!   r8  rh   	unsqueezerepeatrm  ri   rq  ro  squeezer   rJ   rp  )r,   r@  rw  is_imagerq   rp   rr   rs   rt   r1   codesimage_tokenssingle_imager   s                 r.   encodeEmu3VQVAE.encode  sP   $$){{==H2>2D2D/J&'11!4;;AxAqQL<H<N<N9J(E\2 &--aAq!<6 &--aAq!<m,+3u}}Q' '*,&D
&D" D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr&D 	 

 
s   6A	Er1   c                    UR                   S:H  nU(       a  UR                  S5      nUR                  u  p4pVU R                  R	                  UR                  5       5      nUR                  S   nUR                  X4XVU5      R                  SSSSS5      R                  5       nU R                  U5      n	UR                  SSSSS5      nU	R                  SSSSS5      n	U R                  X5      n
U
R                  UX@R                  R                  -  U R                  R                  XPR                  -  X`R                  -  5      n
U(       a	  U
S S 2S4   $ U
$ )Nr   r   rd   r   rc   r   )ry  rz  rh   ro  r[   flattenrk   ri   rj   rr  rn  r  r!   r8  r   rs  )r,   r1   r}  rp   rq   rs   rt   quantrr   
post_quantvideos              r.   decodeEmu3VQVAE.decode  s;    %%*)33A6M.;.A.A+
f''(=(=(?@;;r?

:IQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/{{===KK$$...---
 'uQT{1E1r0   )r!   rn  rm  rr  rq  ro  rs  rp  )rF   rG   rH   rI   r   config_classbase_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backend_no_split_modulesrj  r&   rK   rL   r  r  rP   rQ   rR   s   @r.   rR  rR    su     #L$$ON!"&?* *5<< ell 82ELL 2 2r0   rR  c                       \ rS rSrSrS r\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       rS
\\R                     S\R                  4S jrS
\R                  S\R                  4S jrSrg)Emu3ImageVocabularyMappingi*  zE
A class for mapping discrete image tokens from VQGAN to BPE tokens.
c                 h    Xl         UR                  S5      U l        UR                  S5      U l        g )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r,   r  s     r.   r&   #Emu3ImageVocabularyMapping.__init__/  s)    "%MM/:'mmI6r0   c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf Nz<|visual tokensortedr  items
startswithr,   namevals      r.   r  'Emu3ImageVocabularyMapping.image_tokens4  s<    DNN,@,@,Bh,BytdooVfFgs,Bhiih   A
A
c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf r  r  r  s      r.   image_tokens_str+Emu3ImageVocabularyMapping.image_tokens_str8  s<    T^^-A-A-Ci-C	tWgGht-Cijjir  c                 z    U R                    Vs0 s H  n[        USS 5      U R                  U   _M!     sn$ s  snf )Nir   )r  rJ   r  )r,   tokens     r.   img2bpe"Emu3ImageVocabularyMapping.img2bpe<  s;    FJF[F[\F[UE"RL!4>>%#88F[\\\s   &8c                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf r$   )r  r  )r,   kvs      r.   bpe2img"Emu3ImageVocabularyMapping.bpe2img@  s-    !%!3!3!56!5!5666s   0c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ Nr   dtype)rK   zerosmaxr  keysrJ   r  r,   mappingr  r  s       r.   bpe2img_mapping_tensor1Emu3ImageVocabularyMapping.bpe2img_mapping_tensorD  R    ++c$,,"3"3"56:%))LLL&&(DAAJ )r0   c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ r  )rK   r  r  r  r  rJ   r  r  s       r.   img2bpe_mapping_tensor1Emu3ImageVocabularyMapping.img2bpe_mapping_tensorK  r  r0   	img_batchr9   c                 "   UR                   n[        R                  " UR                  S   S4[        R                  S9U R
                  -  nU R                  UR                  S5         n[        R                  " XC/SS9nUR                  U5      $ )Nr   r   r  cpurd   rg   )	devicerK   onesrh   rJ   r  r  torL  )r,   r  r  eol_row
img_tokenss        r.   convert_img2bpe*Emu3ImageVocabularyMapping.convert_img2bpeR  su    !!**iooa0!4EIIFIZIZZ00e1DE
YY
4"=
}}V$$r0   c                     UR                   nUSS S24   nU R                  UR                  S5         nUR                  U5      $ )N.rd   r  )r  r  r  )r,   r  r  r  s       r.   convert_bpe2img*Emu3ImageVocabularyMapping.convert_bpe2imgY  sG    !!c3B3h'	00e1DE
}}V$$r0   )r  r  r  N)rF   rG   rH   rI   r{   r&   r   r  r  r  r  r  r  r   rK   rL   r  r  rP   r;   r0   r.   r  r  *  s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r0   r  c                   (    \ rS rSrS/rSrSrS rSrg)Emu3PreTrainedModeli`  r   Tc                    U R                   R                  5       R                  n[        U[        R
                  [        R                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g g )NrZ  )meanstdrW   )r!   get_text_configinitializer_ranger[  r'   r`  r   r\   r]   rd  r   rf  rX   re  Emu3RMSNormfill_)r,   rg  r  s      r.   rj  !Emu3PreTrainedModel._init_weightsg  s    kk))+==fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .,,MM$$S) -r0   r;   N)	rF   rG   rH   rI   r  r  r  rj  rP   r;   r0   r.   r  r  `  s      "&*r0   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Emu3TextModeliu  r!   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r$   )r%   r&   r'   r  r  num_hidden_layersr   layersr+   s      r.   r&   Emu3TextModel.__init__v  sH     mmBGH`H`BabBaYf0Bab
bs   A)r  )rF   rG   rH   rI   r   r&   rP   rQ   rR   s   @r.   r  r  u  s    
z 
 
r0   r  c                   8   ^  \ rS rSr\rU 4S jrU 4S jrSrU =r	$ )Emu3ForCausalLMi}  c                 D   > [         TU ]  U5        [        U5      U l        g r$   )r%   r&   r  modelr_   s     r.   r&   Emu3ForCausalLM.__init__  s     "6*
r0   c                  6   > [        5       R                  5         g)aI  
Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import requests
>>> from PIL import Image

>>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```N)r%   rD   )super_kwargsr-   s    r.   rD   Emu3ForCausalLM.forward  s    $ 	r0   )r  )
rF   rG   rH   rI   r   r  r&   rD   rP   rQ   rR   s   @r.   r  r  }  s    !L+ r0   r  c            !       @  ^  \ rS rSrSS0rSrU 4S jrS rS rS\	R                  S	\	R                  4S
 jrS\	R                  S	\	R                  4S jr\	R                  S\	R                  S\S\4S j5       r\\            SS\	R                  S\	R                  S	\	R&                  S\\	R&                     S\\	R                     S\\   S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\\   S\\\4   4S jj5       5       rSrU =r$ )	Emu3Modeli  ztext_model.model
text_modelFc                   > [         TU ]  U5        [        R                  UR                  5      U l        U R
                  R                  b/  U R
                  R                   Vs/ s H  nSU 3PM
     snU l        [        UR                  5      U l	        [        UR                  5      U l        U R                  5         g s  snf )Nztext_model.)r%   r&   r  _from_configtext_configr  _tied_weights_keysrR  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingru  )r,   r!   r  r-   s      r.   r&   Emu3Model.__init__  s     '44V5G5GH??--9BF//BdBd&eBdQQC'8Bd&eD# !1!12"<V=R=R"S 	 'fs   $C c                 6    U R                   R                  5       $ r$   )r  get_input_embeddingsr,   s    r.   r  Emu3Model.get_input_embeddings  s    3355r0   c                 :    U R                   R                  U5        g r$   )r  set_input_embeddingsr,   values     r.   r  Emu3Model.set_input_embeddings  s    ,,U3r0   r@  rw  c                 N    [         R                  S5        U R                  U5      $ )Nz`model.get_image_tokens()` is deprecated and will be removed in v4.58. To obtain discrete token use `model.get_image_features()`)loggerwarningget_image_featues)r,   r@  rw  s      r.   get_image_tokensEmu3Model.get_image_tokens  s'     O	
 %%l33r0   c                     U R                   R                  X5      nU Vs/ s H+  o@R                  R                  U5      R	                  5       PM-     nn[
        R                  " U5      nU$ s  snf )a  
Tokenizes images into discrete tokens with VQGAN module. Converts
obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
special tokens.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
    image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
        The sizes of the images in the batch, being (height, width) for each image.
)r  r  r  r  r  rK   rL  )r,   r@  rw  image_tokens_listtokensbpe_tokens_list
bpe_tokenss          r.   get_image_featuresEmu3Model.get_image_features  sb     !LL//JctuctY_22BB6JRRTctuYY/
 vs   2A,r  rs   rt   c                     USS2SS24   R                  SX#S-   5      nU R                  R                  U5      nU R                  R	                  U5      nU$ )a  
Decodes generated image tokens from language model to continuous pixel values
with VQGAN module via upsampling.

Args:
    image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
        The tensors corresponding to the input images.
    height (`int`):
        Height of the generated image before upsampling.
    width (`int`):
        Width of the generated image before upsampling.
Nrd   r   )rk   r  r  r  r  )r,   r  rs   rt   	sequencesimages         r.   decode_image_tokensEmu3Model.decode_image_tokens  sV     !CRC(--b&!)D	..>>yI##L1r0   	input_idsr2   r3   past_key_valuesinputs_embedsr6   r5   output_hidden_statesreturn_dictr7   r@   r9   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUSL USL-  (       a  [	        S5      eUb  Ub  [	        S5      eUb`  U R                  X#5      nXR                  R                  :H  nUR                  UR                  UR                  5      nUR                  X5      nU R                  " SUUUUUUU	U
SUS.
UD6nU$ )aH  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oneT
r  r2   r3   r  r  r6   r5   r  r  r7   r;   )r!   r5   r  use_return_dict
ValueErrorr  r  r  r  r  r  masked_scatterr  )r,   r  r@  rw  r2   r3   r  r  r6   r5   r  r  r7   r@   r  special_image_maskrC   s                    r.   rD   Emu3Model.forward  s-   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<s  #(Av  #22<ML!*.E.E.T.T!T'??9+;+;Y__ML!001CRI // 
)%+'/!5)
 
 r0   )r  r  r  r  )NNNNNNNNNNNN)rF   rG   rH   rI   _checkpoint_conversion_mapping_supports_static_cacher&   r  r  rK   rO   rM   r  r  no_gradrJ   r  r   r   rL   r   r	   rN   r   r   r   r   r   rD   rP   rQ   rR   s   @r.   r  r    s   &8,%G""
644U->-> 4UM]M] 4u/@/@ uO_O_ " ]]0@0@ # VY  $  '+*.$(1537+/59$(,0/3&*59;##; ''; \\	;
 !.; u//0; "%;   1 12; D>; $D>; 'tn; d^; !!1!12; -.; 
u,,	-;  ;r0   r  c            %       v  ^  \ rS rSrSrSSSS.rSrU 4S jrS	 rS
 r	\
S 5       r\
S 5       r\\              S%S\R                   S\R"                  S\R$                  S\\R$                     S\\R                      S\\   S\\R"                     S\\   S\\   S\\   S\\   S\\R                      S\\R                      S\\\R$                  4   S\\   S\\\4   4 S jj5       5       r       S&U 4S jjr\S\R$                  S\S \S!\R>                  S\R$                  S"\4S# j5       r S$r!U =r"$ )'Emu3ForConditionalGenerationi   zmodel.text_modelzmodel.vqmodellm_head)z^text_model.modelz^vqmodelz^text_model.lm_headFc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NF)r   )r%   r&   r  r  r'   r`  r  hidden_size
vocab_sizer  ru  r_   s     r.   r&   %Emu3ForConditionalGeneration.__init__"  sS     v&
yy!3!3!?!?ASASA^A^ejkr0   c                 6    U R                   R                  5       $ r$   )r  r  r  s    r.   r  1Emu3ForConditionalGeneration.get_input_embeddings)  s    zz..00r0   c                 :    U R                   R                  U5        g r$   )r  r  r  s     r.   r  1Emu3ForConditionalGeneration.set_input_embeddings,  s    

''.r0   c                 .    U R                   R                  $ r$   )r  r  r  s    r.   r  'Emu3ForConditionalGeneration.text_model0  s    zz$$$r0   c                 .    U R                   R                  $ r$   )r  r  r  s    r.   r  $Emu3ForConditionalGeneration.vqmodel4  s    zz!!!r0   r  r@  rw  r2   r3   r  r  r6   r5   r  r  r7   labelslogits_to_keepr@   r9   c                 ,   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  " SUUUUUUU	U
SUS.
UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb3  U R                  " SUXR                   R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )aL  
image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
    The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
    [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
    [`Emu3ImageProcessor`] for processing images).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
>>> import torch
>>> import requests
>>> from PIL import Image

>>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
>>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

>>> conversation = [
...     {
...     "role": "system",
...     "content": [
...         {"type": "text", "text": "You are a helpful assistant."},
...         ],
...     },
...     {
...     "role": "user",
...     "content": [
...         {"type": "image"},
...         {"type": "text", "text": "Please describe the image."},
...         ],
...     },
... ]

>>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
>>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

>>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```NTr  r   )logitsr,  r"  )lossr/  r  r1   
attentionsr;   )r!   r5   r  r  r  r[  rJ   slicer  loss_functionr  r"  r   r  r1   r1  )r,   r  r@  rw  r2   r3   r  r  r6   r5   r  r  r7   r,  r-  r@   rC   r1   slice_indicesr/  r0  s                        r.   rD   $Emu3ForConditionalGeneration.forward8  s>   B 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
)%+'/!5)
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD &#33!//))
 	
r0   c	                 V   > [         TU ]  " U4UUUUUUUS.U	D6n
US   S:w  a  S U
S'   U
$ )N)r  r2   r  r7   r3   r@  r6   r   r@  )r%   prepare_inputs_for_generation)r,   r  r  r2   r  r7   r3   r6   r@  r@   model_inputsr-   s              r.   r7  :Emu3ForConditionalGeneration.prepare_inputs_for_generation  sZ     w<

+)')%%

 

 !!+/L(r0   sequence_lengthtarget_lengthr  rp   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nrc   )
fill_valuer  r  r   )diagonal)r  rd   r   )re   rK   finfominfullr  triuaranger  expandclonerh   r  masked_fill)r2   r:  r;  r  r7   rp   r@   causal_mask	min_dtypemask_lengthpadding_masks              r.   5_prepare_4d_causal_attention_mask_with_cache_positionREmu3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r0   )r  r  )NNNNNNNNNNNNNr   )NNNNNTN)#rF   rG   rH   rI   r  r  r  r&   r  r  propertyr  r  r   r   rK   rM   rO   rL   r   r	   rN   r   rJ   r   r   r   r   rD   r7  staticmethodr  rK  rP   rQ   rR   s   @r.   r  r    s>   /#(&"
 #1/ % % " "  '+*.$(1537+/59$(,0/3&*59-134d
##d
 ''d
 \\	d

 !.d
 u//0d
 "%d
   1 12d
 D>d
 $D>d
 'tnd
 d^d
 !!1!12d
 ))*d
 c5<</0d
  *+!d
" 
u,,	-#d
  d
R > 444 4 {{	4
 4 4 4r0   r  )r  r  r  r  rR  r  )Kr6  	functoolsr   typingr   r   r   r   rK   torch.nnr'   torch.nn.functional
functionalr   torch.utils.checkpointcache_utilsr	   
generationr
   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   chameleon.modeling_chameleonr   r   llama.modeling_llamar   r   r   r   siglip.modeling_siglipr   configuration_emu3r   r   r   
get_loggerrF   r  r   r  rT   r}   r   r   r   r   r   r   r   r   r   r   r   r  r#  r-  rF  rR  r  r  r  r  r  r  __all__r;   r0   r.   <module>rb     s  "  % / /        ) B 6 - & > > h e e 4 K K 
		H	%A( AH$ryy $D	%H 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~&o &V V299 D8 8v7ryy 7tCryy CLCryy CL l2 l2l2^3% 3%l*2I **
J 3 
&(;_ 8}# }@\#6 \~r0   