
    fTh                     H   S r SSKJr  SSKJrJrJrJr  SSKrSSK	J
s  Jr  SSKrSSKJ
r
  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(J)r)  SSK*J+r+J,r,  \'" 5       (       a  SSK-J.r.  SSK/J0r0  \)Rb                  " \25      r3 " S S\
Rh                  5      r5\"Rl                  " \55         " S S\
Rh                  5      r7 " S S\75      r8 " S S\75      r9S r:SOS jr; " S S \
Rh                  5      r< " S! S"\
Rz                  5      r>S#\R~                  S$\@S%\R~                  4S& jrA SPS'\
Rh                  S(\R~                  S)\R~                  S*\R~                  S+\\R~                     S,\BS-\B4S. jjrC " S/ S0\
Rh                  5      rD " S1 S2\
Rh                  5      rE " S3 S4\
Rh                  5      rF " S5 S6\
Rh                  5      rG " S7 S8\
Rh                  5      rH " S9 S:\
Rh                  5      rI " S; S<\
Rh                  5      rJ " S= S>\
Rh                  5      rK " S? S@5      rL\% " SA SB\5      5       rM\%" SCSD9 " SE SF\M5      5       rN\% " SG SH\M5      5       rO " SI SJ\\$5      rP\%" SKSD9 " SL SM\M\5      5       rQ/ SNQrRg)QzPyTorch Chameleon model.    )cached_property)CallableOptionalTupleUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ALL_LAYERNORM_LAYERS)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging   )ChameleonConfigChameleonVQVAEConfig)	BlockMask)make_flex_block_causal_maskc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )ChameleonRMSNorm7   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z/
ChameleonRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      h/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/chameleon/modeling_chameleon.pyr&   ChameleonRMSNorm.__init__8   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor(   float32powmeanrsqrtr+   r*   )r,   hidden_statesinput_dtypevariances       r0   forwardChameleonRMSNorm.forward@   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r2   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler*   shaper+   r,   s    r0   
extra_reprChameleonRMSNorm.extra_reprG   s*    ))*+6$2G2G1HIIr2   )r+   r*   )ư>)	__name__
__module____qualname____firstlineno__r&   r@   rF   __static_attributes____classcell__r/   s   @r0   r"   r"   7   s    $;J Jr2   r"   c                   Z   ^  \ rS rSrSU 4S jjr\R                  " 5       S 5       rSrU =r	$ )ChameleonRotaryEmbeddingP   c           	      P  > [         TU ]  5         XPl        Xl        X l        X0l        SU R
                  [        R                  " SU R                  S[        R                  S9R                  U[        R                  S9U R                  -  -  -  nU R                  SUSS9  X l        g )	N      ?r   r4   r7   devicer7   inv_freqF
persistent)r%   r&   scaling_factordimmax_position_embeddingsbaser(   arangeint64r8   floatregister_buffermax_seq_len_cached)r,   r\   r]   r^   rW   r[   rX   r/   s          r0   r&   !ChameleonRotaryEmbedding.__init__Q   s    ,'>$	IIQ!5;;?BB&X]XcXcBdgkgogooq
 	ZeD"9r2   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      nUS S 2S S S 24   R                  5       nUR                  R
                  n[        U[        5      (       a  US:w  a  UOSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       nUR                  5       n	S S S 5        WR                  UR                  S
9W	R                  UR                  S
94$ ! , (       d  f       N@= f)Nr   r5   r   mpscpuF)device_typeenabledr4   r\   rU   )rX   ra   expandrD   rW   type
isinstancestrr(   autocast	transposecatcossinr8   r7   )
r,   xposition_idsinv_freq_expandedposition_ids_expandedrh   freqsembrr   rs   s
             r0   r@    ChameleonRotaryEmbedding.forward_   s!    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @ hhmm%/S%A%AkUZFZk`e^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')C'')C	 D
 vvAGGv$cff177f&;;; DCs   %A(E
E)r^   r\   r]   rc   r[   )i   i'  NrT   )
rI   rJ   rK   rL   r&   r(   no_gradr@   rM   rN   rO   s   @r0   rQ   rQ   P   s"    : ]]_< <r2   rQ   c                   ,   ^  \ rS rSrSrU 4S jrSrU =r$ )%ChameleonLinearScalingRotaryEmbeddingp   z_ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendevc                 f   > UR                  5       U R                  -  n[        TU ]  X5      u  p4X44$ N)ra   r[   r%   r@   )r,   rt   ru   rr   rs   r/   s        r0   r@   -ChameleonLinearScalingRotaryEmbedding.forwards   s3    #))+d.A.AA7?13xr2    rI   rJ   rK   rL   __doc__r@   rM   rN   rO   s   @r0   r}   r}   p   s    i r2   r}   c                   ,   ^  \ rS rSrSrU 4S jrSrU =r$ ))ChameleonDynamicNTKScalingRotaryEmbeddingz   zqChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozillac           	        > [         R                  " U5      S-   nX0R                  :  a  U R                  U R                  U-  U R                  -  U R                  S-
  -
  U R
                  U R
                  S-
  -  -  -  nSU[         R                  " SU R
                  S[         R                  S9R                  UR                  [         R                  S9U R
                  -  -  -  nU R                  SUSS	9  [        TU ]5  X5      u  pgXg4$ )
Nr   r4   rT   r   rU   rV   rX   FrY   )r(   maxr]   r^   r[   r\   r_   r`   r8   rW   ra   rb   r%   r@   )	r,   rt   ru   seq_lenr^   rX   rr   rs   r/   s	           r0   r@   1ChameleonDynamicNTKScalingRotaryEmbedding.forward}   s    ))L)A-11199$$w.1M1MMRVReRehiRij((dhhl+ - -D LLDHHau{{CFFahh^c^i^iFjmqmumuuwH   X% H7?13xr2   r   r   rO   s   @r0   r   r   z   s    { r2   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr5   r4   rj   )rD   r(   rq   )rt   x1x2s      r0   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkrr   rs   ru   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr2   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )ChameleonMLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nbias)r%   r&   configr-   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr
   
hidden_actact_fnr,   r   r/   s     r0   r&   ChameleonMLP.__init__   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r2   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )r   r   r   r   )r,   rt   r   s      r0   r@   ChameleonMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r2   )r   r   r   r   r-   r   r   rI   rJ   rK   rL   r&   r@   rM   rN   rO   s   @r0   r   r      s    0 r2   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ChameleonLayerNorm   ar  
LayerNorm but computes stats only over the last dim because Chameleon applies gamma and beta
from each shard separately to each head, instead of reducing. We can apply each head's own
gamma/beta by repeat-interleaving weights from each shard, but the stats have to be computed
in the last dimension. This module applies gamma/beta manually to fulfill this requirement.
c                 D   > [         TU ]  " U/UQ70 UD6  US   4U l        g )Nr5   )r%   r&   normalized_shape)r,   r-   argskwargsr/   s       r0   r&   ChameleonLayerNorm.__init__   s)    6t6v6!,R 2r2   c                 ~    [         R                  " XR                  S S SS9nXR                  -  U R                  -   nU$ )Ngh㈵>r.   )F
layer_normr   r*   r   r,   r=   s     r0   r@   ChameleonLayerNorm.forward   s9    ]4I4I4QU[_`%3dii?r2   )r   )	rI   rJ   rK   rL   r   r&   r@   rM   rN   rO   s   @r0   r   r      s    3 r2   r   r=   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rD   rk   reshape)r=   r   batchnum_key_value_headsslenhead_dims         r0   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr2   modulequerykeyvalueattention_maskscalingdropoutc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr4   r	   r5   )r\   r7   )ptrainingr   )r   num_key_value_groupsr(   matmulrp   rD   r   
functionalsoftmaxr9   r8   r7   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r0   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r2   c                   D  ^  \ rS rSrSrSS\S\\   4U 4S jjjrS r	      SS\
R                  S\\
R                     S	\\
R                     S
\\   S\S\S\\
R                     S\\
R                  \\
R                     \\\
R                        4   4S jjrSrU =r$ )ChameleonAttention   z=Multi-headed attention from 'Attention Is All You Need' paperr   	layer_idxc                   > [         TU ]  5         Xl        X l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                  U l	        UR                  U l        U R                  U R                  -  U l        UR                  U l        U R                  U R                  -  U l        UR                  U l        UR                   U l        SU l        UR$                  U l        U R                  S-  U l        U R                  U R                  -  U R                  :w  a&  [)        SU R                   SU R                   S35      e[*        R,                  " U R                  U R                  U R                  -  UR.                  S9U l        [*        R,                  " U R                  U R                  U R                  -  UR.                  S9U l        [*        R,                  " U R                  U R                  U R                  -  UR.                  S9U l        [*        R,                  " U R                  U R                  UR.                  S9U l        [9        U R                  U R                  45      U l        [9        U R                  U R                  45      U l        U R?                  5         g )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.T      z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r   ) r%   r&   r   r   loggerwarning_oncer/   rI   attention_dropoutr-   num_attention_heads	num_headsr   r   r   r]   
rope_theta	is_causalmodel_parallel_sizer   
ValueErrorr   r   attention_biasq_projk_projv_projo_projr   q_normk_norm
_init_roper,   r   r   r/   s      r0   r&   ChameleonAttention.__init__  s,   " !8!8 9 :, , "(!9!9!--33((DNN:#)#=#= $(NNd6N6N$N!'-'E'E$ ++#)#=#= }}d*MMDNN*t/?/??QRVRbRbQc$T^^$4B8 
 ii 0 0$..4==2PW]WlWlmii 0 0$2J2JT]]2Zagavavwii 0 0$2J2JT]]2Zagavavwii 0 0$2B2BI^I^_($..$--)HI($*B*BDMM)RSr2   c                    U R                   R                  c/  [        U R                  U R                  U R
                  S9U l        g U R                   R                  S   nU R                   R                  S   nUS:X  a0  [        U R                  U R                  UU R
                  S9U l        g US:X  a0  [        U R                  U R                  UU R
                  S9U l        g [        SU 35      e)N)r]   r^   rl   factorlinear)r]   r[   r^   dynamiczUnknown RoPE scaling type )
r   rope_scalingrQ   r   r]   r   
rotary_embr}   r   r   )r,   scaling_typer[   s      r0   r   ChameleonAttention._init_rope(  s    ;;##+6(,(D(D__DO  ;;33F;L![[55h?Nx'"GMM,0,H,H#1	# *"KMM,0,H,H#1	# !#=l^!LMMr2   r=   r   ru   past_key_valueoutput_attentions	use_cachecache_positionr   c                 &   UR                  5       u  pnU R                  U5      nU R                  U5      nU R                  U5      nUR	                  SU R
                  U R                  5      nU R                  U5      nUR	                  SU R                  U R                  5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nU R                  X5      u  nn[        XUU5      u  pUb$  UXS.nUR                  XU R                  U5      u  p[         nU R"                  R$                  S:w  aT  U R"                  R$                  S:X  a  U(       a  [&        R)                  S5        O[*        U R"                  R$                     nU" U UUUU4U R,                  (       d  SOU R.                  U R0                  S	.UD6u  nnUR	                  XS5      R3                  5       nU R5                  U5      nU(       d  S nUUU4$ )
Nr5   r   r4   )rs   rr   r   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   )sizer   r   r   r   r   r   r   r   r   rp   viewr   r   updater   r   r   _attn_implementationr   r   r   r   r   r   r   r   )r,   r=   r   ru   r   r   r   r   r   bszq_len_query_statesr   r   rr   rs   cache_kwargsattention_interfacer   r   s                        r0   r@   ChameleonAttention.forwardC  sN    &**,A{{=1[[/
{{=1#++BN{{<0''D,D,DdmmT
[[,
#++CV``abdef''D4L4Ldmm\ffghjkl
#((T5M5Mt}}]gghiklm??<>S#7RUWZ#[ %#&sUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ "))#b9DDFkk+. LL.88r2   )r   r   r   r-   r   r   r   r   r]   r   r   r   r   r   r   r   r   r   r   r   r   NNNFFN)rI   rJ   rK   rL   r   r   r   intr&   r   r(   Tensor
LongTensorr   boolr   r@   rM   rN   rO   s   @r0   r   r      s    G# #8C= # #NN< 2637*."'59?9||?9 !.?9 u//0	?9
 !?9  ?9 ?9 !!1!12?9 
u||Xell3XeELL>Q5RR	S?9 ?9r2   r   c                   8  ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )ChameleonDecoderLayeri  r   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g N)r   r   r   r%   r&   r-   r   	self_attnr   mlpr"   rms_norm_epsinput_layernormpost_attention_layernormr   s      r0   r&   ChameleonDecoderLayer.__init__  k    !--+6O'/0B0BH[H[\(89K9KQWQdQd(e%r2   r=   r   ru   r   r   r   r   r   c                     Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  pnX-   nUn	U R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU(       a  X4-  nU$ )at  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
r=   r   ru   r   r   r   r   r   )r  r  r  r  r,   r=   r   ru   r   r   r   r   r   residualself_attn_weightspresent_key_valueoutputss                r0   r@   ChameleonDecoderLayer.forward  s    < !,,]; ?Cnn 	?
')%)/)	?
 	?
;*; !0 !55mD/ 0 "++G++Gr2   r-   r  r  r  r  r  rI   rJ   rK   rL   r   r  r&   r(   r  r   r  r   r  r   FloatTensorr@   rM   rN   rO   s   @r0   r  r    s    f f3 f 2637*.,1$)59=||= !.= u//0	=
 != $D>= D>= !!1!12= 
u  (51B1BEDUDU1U+V"WW	X= =r2   r  c                   8  ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )ChameleonSwinDecoderLayeri  r   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g r  r  r   s      r0   r&   "ChameleonSwinDecoderLayer.__init__  r  r2   r=   r   ru   r   r   r   r   r   c                     Un	U R                   " SUUUUUUUS.UD6u  pnU R                  U5      nX-   nUn	U R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Indices of positions of each input sequence tokens in the position embeddings
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence.
r  r   )r  r  r  r  r   s                r0   r@   !ChameleonSwinDecoderLayer.forward  s    > ! ?Cnn 	?
')%)/)	?
 	?
;*; ,,]; 0 /55mD 0 "++G++Gr2   r&  r  r'  rO   s   @r0   r*  r*    s    f f3 f 2637*.,1$)59;||; !.; u//0	;
 !; $D>; D>; !!1!12; 
u  (51B1BEDUDU1U+V"WW	X; ;r2   r*  c                   N   ^  \ rS rSrSrU 4S jrS\R                  4S jrSr	U =r
$ )ChameleonVQVAEVectorQuantizeri  a  
A module for vector quantization using learned embedding vectors.

This module implements the quantization process similar to te one described in
the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
input vectors into discrete codebook vectors, which are learned during training.
Current implementation improves over previous ones by avoiding costly matrix multiplications
and allowing for post-hoc remapping of indices.
c                    > [         TU ]  5         UR                  U l        UR                  U l        [        USS5      U l        [        R                  " U R                  U R                  5      U l	        g )Nbetag      ?)
r%   r&   num_embeddings	embed_dimembedding_dimgetattrr2  r   	Embedding	embeddingr   s     r0   r&   &ChameleonVQVAEVectorQuantizer.__init__%  sX    $33#--FFD1	d&9&94;M;MNr2   hidden_statec           
      >   UR                  SSSS5      R                  5       nUR                  SU R                  5      n[        R
                  " US-  SSS9[        R
                  " U R                  R                  S-  SS9-   S[        R                  " S	X R                  R                  R                  SS5      5      -  -
  n[        R                  " USS9nU R                  U5      R                  UR                  5      n[        R                  " UR                  5       U-
  S-  5      U R                  [        R                  " XQR                  5       -
  S-  5      -  -   nXU-
  R                  5       -   nUR                  SSSS5      R                  5       nXVU4$ )
Nr   r4   r	   r   r5   T)r\   r6   rj   z	bd,dn->bn)permuter   r  r5  r(   sumr8  r*   einsumrp   argminrD   r;   detachr2  )r,   r:  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantlosss          r0   r@   %ChameleonVQVAEVectorQuantizer.forward-  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e"5"5"77A=\
 P
 

 *,-N,V,V,XX 0771aCNNP!)===r2   )r2  r8  r5  r3  )rI   rJ   rK   rL   r   r&   r(   r  r@   rM   rN   rO   s   @r0   r0  r0    s#    O>ELL > >r2   r0  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )#ChameleonVQVAEEncoderConvDownsampleiI  c                 Z   > [         TU ]  5         [        R                  " XSSSS9U l        g )Nr	   r4   r   kernel_sizestridepadding)r%   r&   r   Conv2dconvr,   in_channelsr/   s     r0   r&   ,ChameleonVQVAEEncoderConvDownsample.__init__J  s%    IIkAaYZ[	r2   c                 V    [         R                  " USSSS9nU R                  U5      nU$ )N)r   r   r   r   constantr   )padmoder   )r   rU  rO  r   s     r0   r@   +ChameleonVQVAEEncoderConvDownsample.forwardN  s+    mJVWX		-0r2   )rO  r   rO   s   @r0   rH  rH  I  s    \ r2   rH  c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ ) ChameleonVQVAEEncoderResnetBlockiU  c                   > [         TU ]  5         X l        Uc  UOUU l        X@l        [
        R                  R                  SUSSS9U l        [
        R                  R                  X#SSSS9U l
        [
        R                  R                  SUSSS9U l        [
        R                  R                  UR                  5      U l        [
        R                  R                  X3SSSS9U l        U R                  U R                  :w  a]  U R                  (       a&  [
        R                  R                  X#SSSS9U l        g [
        R                  R                  X#SSSS9U l        g g )	N    rH   T
num_groupsnum_channelsr.   affiner	   r   rJ  r   )r%   r&   rQ  out_channelsuse_conv_shortcutr(   r   	GroupNormnorm1rN  conv1norm2Dropoutr   conv2conv_shortcutnin_shortcut)r,   r   rQ  r`  rh  r/   s        r0   r&   )ChameleonVQVAEEncoderResnetBlock.__init__V  s%    	&+7+?K\!.XX''2KUYbf'g
XX__[AVWab_c
XX''2LVZcg'h
xx''7XX__\QWXbc_d
t000%%%*XX__[\]fgqr_%s"$)HHOOK[\efpqO$r!	 1r2   c                    UnU R                  U5      nU[        R                  " U5      -  nU R                  U5      nU R	                  U5      nU[        R                  " U5      -  nU R                  U5      nU R                  U5      nU R                  U R                  :w  a7  U R                  (       a  U R                  U5      nX!-   $ U R                  U5      nX!-   $ r   )rc  r(   sigmoidrd  re  r   rg  rQ  r`  ra  rh  ri  )r,   r=   r!  s      r0   r@   (ChameleonVQVAEEncoderResnetBlock.forwardm  s     

=1}55

=1

=1}55]3

=1t000%%--h7 ''  ,,X6''r2   )
rd  rg  rh  r   rQ  ri  rc  re  r`  ra  )NFr   rO   s   @r0   rY  rY  U  s    
 s.( (r2   rY  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )ChameleonVQVAEEncoderAttnBlocki  c                   > [         TU ]  5         Xl        [        R                  R                  SUSSS9U l        [        R                  R                  XSSSS9U l        [        R                  R                  XSSSS9U l	        [        R                  R                  XSSSS9U l
        [        R                  R                  XSSSS9U l        g )Nr[  rH   Tr\  r   r   rJ  )r%   r&   rQ  r(   r   rb  normrN  r   r   vproj_outrP  s     r0   r&   'ChameleonVQVAEEncoderAttnBlock.__init__  s    &HH&&";TXae&f	qQR\]^qQR\]^qQR\]^aXYcder2   c                 Z   UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  u  pgpUR                  XgX-  5      R                  SSS5      nUR                  XgX-  5      n[        R                  " X45      n
U
[        U5      S-  -  n
[        R                  " U
SS9n
UR                  XgX-  5      nU
R                  SSS5      n
[        R                  " XZ5      R                  XgX5      nU R                  U5      nX+-   $ )Nr   r4   r   r   rj   )rq  r   r   rr  rD   r   r<  r(   bmmr  r   r   rs  )r,   r=   r!  r	  r   r   
batch_sizechannelsheightwidthr   r   s               r0   r@   &ChameleonVQVAEEncoderAttnBlock.forward  s    		-0vvm,VVM*
vvm, /;.@.@+
f#++J&.QYYZ[]^`ab''
fnM
yy:#s8}'>?yy15 $++J&.Q#++Aq!4ii;CCJZ`hmmK0%%r2   )rQ  r   rq  rs  r   rr  r   rO   s   @r0   ro  ro    s    f& &r2   ro  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )ChameleonVQVAEEncoderi  c           
        > [         TU ]  5         [        UR                  5      U l        UR
                  U l        UR                  nUR                  nUR                  nUR                  nUR                  nUR                  n[        R                  R                  XBSSSS9U l        UnS[        U5      -   n	Xl        [        R"                  " 5       U l        ['        U R                  5       GH%  n
[        R"                  " 5       n[        R"                  " 5       nX)U
   -  nX'U
   -  n['        U R
                  5       Hk  nUR)                  [+        UUUS95        UnUR,                  c  M.  XR,                  ;   d  M?  UR.                  S:X  d  MQ  UR)                  [1        U5      5        Mm     [        R2                  " 5       nUUl        UUl        XR                  S-
  :w  a  [9        U5      Ul        US-  nU R$                  R)                  U5        GM(     [        R2                  " 5       U l        [+        UWUS9U R<                  l        UR.                  S:X  a  [1        U5      O[        R@                  " 5       U R<                  l!        [+        UUUS9U R<                  l"        [        R                  RG                  SUS	S
S9U l$        [        R                  R                  UU(       a  SU-  OUSSSS9U l%        g )Nr	   r   rJ  )r   )r   rQ  r`  vanillar4   r[  rH   Tr\  )&r%   r&   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channels
resolutionrQ  double_latentlatent_channelsr(   r   rN  conv_inrC   in_channel_multiplier
ModuleListdownrangeappendrY  attn_resolutions	attn_typero  ModuleblockattnrH  
downsamplemidblock_1Identityattn_1block_2rb  norm_outconv_out)r,   r   r  r  rQ  r  r  r  curr_resr  i_levelr  r  block_in	block_outi_blockr  r/   s                    r0   r&   ChameleonVQVAEEncoder.__init__  s   "6#<#<=$33,,&&
((,, 00#66xx{qYZdef $u-?'@ @%:"MMO	T112GMMOE==?D$W'EEH%7(CCI !4!454%$,%. %++7 $;$;;((I5KK >x HI 6  99;DDJDI..22"Eh"O#q=IIT"7 3: 99;; !

 GMFVFVZcFc8Bikititiv; !
 **bxUYbf*g#0Ao ( 
r2   pixel_valuesc                 @   U R                  U5      /n[        U R                  5       H  n[        U R                  5       H  nU R                  U   R
                  U   " US   5      n[        U R                  U   R                  5      S:  a"  U R                  U   R                  U   " U5      nUR                  U5        M     X0R                  S-
  :w  d  M  UR                  U R                  U   R                  US   5      5        M     US   nU R                  R                  U5      nU R                  R                  U5      nU R                  R                  U5      nU R                  U5      nU[        R                   " U5      -  nU R#                  U5      nU$ )Nr5   r   r   )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r(   rl  r  )r,   r  r=   r  r  r:  last_hidden_states          r0   r@   ChameleonVQVAEEncoder.forward  sr   l34T112G !4!45#yy177@!"%  tyy)../!3#'99W#5#:#:7#CL#QL$$\2 6 ..22$$TYYw%7%B%B=QSCT%UV 3 *"- HH,,->? HHOO,=> HH,,->? !MM*;<U]]+<== MM*;<  r2   )r  r  r  r  r  r  r  r  )
rI   rJ   rK   rL   r&   r(   r  r@   rM   rN   rO   s   @r0   r}  r}    s!    C
J!E$4$4 ! !r2   r}  c                       \ rS rSrSrS r\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       rS
\R                  S\R                  4S jrSrg)ChameleonImageVocabularyMappingi  zE
A class for mapping discrete image tokens from VQGAN to BPE tokens.
c                 <    Xl         UR                  S5      U l        g )Nz<image>)	vocab_mapgetimage_token_id)r,   r  s     r0   r&   (ChameleonImageVocabularyMapping.__init__
  s    "'mmI6r2   c                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf r   )r  itemsr,   r   rr  s      r0   val2name(ChameleonImageVocabularyMapping.val2name  s-    !%!5!5!78!7!7888   0c           	          [        U R                  R                  5        VVs/ s H  u  pUR                  S5      (       d  M  UPM!     snn5      $ s  snnf )NIMGIMG)sortedr  r  
startswith)r,   namevals      r0   image_tokens,ChameleonImageVocabularyMapping.image_tokens  s<    DNN,@,@,B`,BytdooV^F_s,B`aa`s   A
A
c           
      (  ^ [        S5       Vs0 s H$  n[        [        S5      U-   5      [        U5      _M&     snmS[        S[        4U4S jjnU R                   Vs0 s H!  o3[        U" U R                  U   5      5      _M#     sn$ s  snf s  snf )N
   Aold_namer   c                 R   > SR                  U4S jU [        S5      S  5       5      $ )N c              3   F   >#    U  H  nTR                  X5      v   M     g 7fr   )r  ).0cimg_tkn_chr_mappings     r0   	<genexpr>IChameleonImageVocabularyMapping.bpe2img.<locals>.remap.<locals>.<genexpr>  s"     _B^Q.22188B^s   !r  r5   )joinr  )r  r  s    r0   remap6ChameleonImageVocabularyMapping.bpe2img.<locals>.remap  s$    77_(3x=[]B^___r2   )r  chrordrn   r  r  r  )r,   ir  tokr  s       @r0   bpe2img'ChameleonImageVocabularyMapping.bpe2img  s    BG)L)Qs3s8a<0#a&8)L	`C 	`C 	` @D?P?PQ?PSt}}S1233?PQQ M
 Rs   +B
(Bc                 l    U R                   R                  5        VVs0 s H  u  pX!_M	     snn$ s  snnf r   )r  r  r  s      r0   img2bpe'ChameleonImageVocabularyMapping.img2bpe  s-    !%!3!3!56!5!5666r  c                     [         R                  " [        U R                  R	                  5       5      5      [         R                  " [        U R                  R                  5       5      5      4$ r   )r(   tensorr  r  keysvaluesrE   s    r0   bpe2img_search_tensors6ChameleonImageVocabularyMapping.bpe2img_search_tensors#  sC    ||F4<<#4#4#678%,,vdllNaNaNcGd:eeer2   c                     [         R                  " [        U R                  R	                  5       5      S-   [         R
                  S9nU R                  R                  5        H	  u  p#X1U'   M     U$ )Nr   rU   )r(   zerosr   r  r  r  r  )r,   mappingr   rr  s       r0   img2bpe_mapping_tensor6ChameleonImageVocabularyMapping.img2bpe_mapping_tensor'  sR    ++c$,,"3"3"56:%))LLL&&(DAAJ )r2   	img_batchr   c                 x    UR                   nU R                  UR                  S5         nUR                  U5      $ )Nrg   )rW   r  r8   )r,   r  rW   
img_tokenss       r0   convert_img2bpe/ChameleonImageVocabularyMapping.convert_img2bpe.  s5    !!00e1DE
}}V$$r2   )r  r  N)rI   rJ   rK   rL   r   r&   r   r  r  r  r  r  r  r(   r  r  rM   r   r2   r0   r  r    s    7 9 9 b b R R 7 7 f f  % %%,, %r2   r  c                   R    \ rS rSr\rSrSrSS/rSS/r	Sr
SrSrSrSrSrSrS	 rS
rg)ChameleonPreTrainedModeli4  modelTr  r*  past_key_valuesr   Fc                    U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  [        R                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g [        U[        5      (       a&  UR                  R                  R                  S5        g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                   b2  UR                  R                  UR                      R                  5         g g g )Nr  )r;   stdrT   )r   initializer_rangerm   r   r   rN  r*   datanormal_r   zero_rb  	LayerNormfill_r"   r7  padding_idx)r,   r   r  s      r0   _init_weights&ChameleonPreTrainedModel._init_weightsC  s@   kk++fryy"))455MM&&CS&9{{&  &&( 'r|| <==KK""$MM$$S) 011MM$$S)--MM&&CS&9!!-""6#5#56<<> . .r2   r   N)rI   rJ   rK   rL   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_cache_class_supports_static_cache!_supports_param_buffer_assignment_supports_attention_backendr  rM   r   r2   r0   r  r  4  sX    "L&*#02MN#4m"D!N $ !(-%"&?r2   r  aK  
    The VQ-VAE model used in Chameleon for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv Taigman](https://arxiv.org/abs/2203.13131).
    )custom_introc                   \   ^  \ rS rSr\rS/rS\4U 4S jjrS\R                  4S jr
SrU =r$ )ChameleonVQVAEiU  r0  r   c                 l  > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  R                  UR                  UR                  S5      U l        [        R                  R                  UR                  UR                  S5      U l        U R                  5         g Nr   )r%   r&   r}  encoderr0  quantizer(   r   rN  r  r4  
quant_convpost_quant_convevalr   s     r0   r&   ChameleonVQVAE.__init__`  s|     ,V45f=((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		r2   r  c                 v    U R                  U5      nU R                  U5      nU R                  U5      u  p4nX4U4$ r   )r  r  r  )r,   r  r=   quantemb_lossindicess         r0   encodeChameleonVQVAE.encodei  s<    \26#'==#? ''r2   )r  r  r  r  )rI   rJ   rK   rL   r   r  r  r&   r(   r  r  rM   rN   rO   s   @r0   r  r  U  s7     (L893 (5#3#3 ( (r2   r  c                   ~  ^  \ rS rSrS\4U 4S jjrS rS rS\R                  4S jr
S\R                  4S jr\           SS	\\R                     S\\R                     S
\\R                     S\\R                     S\\   S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\   S\\\4   4S jj5       r S S
\\R                  S4   S\R                  S\R                  S\S\4
S jjr\S
\R                  S\S\S\R6                  S\R                  S\4S j5       rSrU =r$ )!ChameleonModelip  r   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  5      U l        U R                  R                  (       d  [        O[        n[
        R                   " [#        UR$                  5       Vs/ s H
  o2" X5      PM     sn5      U l        [)        UR                  UR*                  S9U l        [.        R1                  UR2                  5      U l        SU l        U R9                  5         g s  snf )Nr   F)r%   r&   pad_token_idr  
vocab_sizer   r7  r-   embed_tokensr  vocabulary_mapvocabulary_mappingr   	swin_normr  r*  r  r  num_hidden_layerslayersr"   r  rq  r  _from_config	vq_configvqmodelgradient_checkpointing	post_init)r,   r   decoder_layerr   r/   s       r0   r&   ChameleonModel.__init__r  s     !.. ++LL):):F<N<NPTP`P`a"A&BWBW"X59[[5J5J-Pimm?DVE]E]?^_?^)]6-?^_
 %V%7%7V=P=PQ	%2263C3CD&+# 	 `s   Ec                     U R                   $ r   r  rE   s    r0   get_input_embeddings#ChameleonModel.get_input_embeddings  s       r2   c                     Xl         g r   r!  r,   r   s     r0   set_input_embeddings#ChameleonModel.set_input_embeddings  s    !r2   r  c                 N    [         R                  S5        U R                  U5      $ )Nz`model.get_image_tokens()` is deprecated and will be removed in v4.58. To obtain discrete token use `model.get_image_features()`)r   warningget_image_featues)r,   r  s     r0   get_image_tokensChameleonModel.get_image_tokens  s'     O	
 %%l33r2   c                     UR                   S   nU R                  R                  U5      u    p4U R                  R	                  U5      nUR                  US5      nU$ )a;  
Tokenizes images into discrete tokens with VQGAN module. Converts
obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
special tokens.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
        The tensors corresponding to the input images.
r   r5   )rD   r  r  r  r  r  )r,   r  rw  r  
image_toksbpe_tokss         r0   get_image_features!ChameleonModel.get_image_features  sX     "''*
<<..|<1**:::F==R0r2   	input_idsr   ru   r  inputs_embedsr   r   output_hidden_statesreturn_dictr   r   r   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
U R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUS L US L-  (       a  [        S5      eUb  Ub  [        S5      eUb  U R                  U5      nXR                  R                  :H  n[        5       (       dz  X   R                  5       UR                  5       :w  aV  XR                  R                  :H  R                  5       nUR                   S   UR                   S   -  n[        SU SU 35      eUR#                  UR$                  UR&                  5      nUR)                  X5      nUc  U R+                  U5      nU(       a0  Uc-  [,        R.                  R1                  5       (       d
  [3        5       nUcE  Ub  UR5                  5       OSn[,        R6                  " UUUR                   S   -   UR$                  S	9nUc  UR9                  S5      nU R;                  X6XU5      nUnU	(       a  S
OS nU(       a  S
OS nS nU R<                   H  nU	(       a  UU4-  nU R
                  (       a4  U R                  (       a#  U R?                  UR@                  UUUUUUU5      nOU" U4UUUUUUS.UD6nUS   nU(       a  UU(       a  SOS   nU(       d  M  UUS   4-  nM     U RC                  U5      nU	(       a  UU4-  nS nU(       a  UnU
(       d  [E        S UUUU4 5       5      $ [G        UUUUS9$ )NzX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz:You must specify exactly one of input_ids or inputs_embedszdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oner   r   z6Image features and image tokens do not match: tokens: z, features rW   r   )r   ru   r   r   r   r   r4   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   )r  rr  s     r0   r  )ChameleonModel.forward.<locals>.<genexpr>  s     t$bq$bs   	)r  r  r=   
attentions)$r   r   r4  r   use_return_dictr  r   r   r   r   r0  r  r  r   numelr=  rD   r8   rW   r7   masked_scatterr  r(   jit
is_tracingr   get_seq_lengthr_   r   _update_causal_maskr  _gradient_checkpointing_func__call__rq  rC   r   )r,   r2  r  r   ru   r  r3  r   r   r4  r5  r   r   r  special_image_maskn_image_tokens_in_textn_image_featurespast_seen_tokensr   r=   all_hidden_statesall_self_attnsnext_decoder_cacher  layer_outputs
next_caches                             r0   r@   ChameleonModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]&&4==Yj I-t";<YZZ#(Av  #22<@L!*.E.E.T.T!T+--)2O2U2U2W[g[m[m[o2o*37N7N7]7]*])b)b)d&#/#5#5a#8<;M;Ma;P#P  LMcLddo  qA  pB  C  (??9+;+;Y__ML!001CRI  --i8M 09M9M9O9O*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..>L]

 & #7BD0d!![[M#!m%55!**t}} $ A A!**! #%"	! !.!	!#.!-#2&7'#1	! 	! *!,M%28I1q%Q"  =#3"55C )F 		-0  -!11
+Jt]J@QSa$bttt&+&+%	
 	
r2   r   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r  flex_attentionr   Fr   )r3  past_key_values_lengthis_trainingr   r5   )sequence_lengthtarget_lengthr7   r   rw  )cudaxpunpu)r   r  anyrm   r(   r  r    r@  is_compileabler   _ignore_causal_mask_sdpar   r7   rD   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrW   rl   finfomin_unmask_unattended)r,   r   rN  r   r  r   rG  using_compilable_cacher7   rT  rU  r   	min_dtypes                r0   rA  "ChameleonModel._update_causal_mask%  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr2   rT  rU  r7   rw  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuer7   rW   r   )diagonalr7  r5   r   )r\   r(   r^  r_  fullrW   triur_   r   rk   clonerD   r8   masked_fill)r   rT  rU  r7   r   rw  r   r   rb  mask_lengthpadding_masks              r0   r]  DChameleonModel._prepare_4d_causal_attention_mask_with_cache_positioni  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r2   )r  r  r  rq  r  r  r  r  )NNNNNNNNNNN)F)rI   rJ   rK   rL   r   r&   r"  r&  r(   r(  r+  r0  r   r   r  r  r   r  r   r   r   r   r   r@   rA  staticmethodr  r7   r]  rM   rN   rO   s   @r0   r  r  p  s    $!"4U->-> 4u/@/@    15481537+/59$(,0/3&*59A
E,,-A
 u001A
 !.	A

 u//0A
 "%A
   1 12A
 D>A
 $D>A
 'tnA
 d^A
 !!1!12A
 -.A
 
u--	.A
 A
T #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r2   r  c                       \ rS rSrSrg)KwargsForCausalLMi  r   N)rI   rJ   rK   rL   rM   r   r2   r0   rq  rq    s    3r2   rq  zb
    Chameleon Model with a head on top used for outputting logits for next token prediction.
    c            !         ^  \ rS rSrS/rU 4S jrS rS rS rS r	S r
S	 r\\            SS
\\R                      S\\R"                     S\\R$                     S\\R                      S\\   S\\R"                     S\\R                      S\\   S\\   S\\   S\\   S\\R                      S\\   S\\\4   4S jj5       5       r       SU 4S jjrSrU =r$ )!ChameleonForConditionalGenerationi  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r%   r&   r  r  r  r   r   r-   lm_headr  r   s     r0   r&   *ChameleonForConditionalGeneration.__init__  sU     #F+
 ++yy!3!3V5F5FUS 	r2   c                 .    U R                   R                  $ r   r  r  rE   s    r0   r"  6ChameleonForConditionalGeneration.get_input_embeddings  s    zz&&&r2   c                 $    XR                   l        g r   rx  r%  s     r0   r&  6ChameleonForConditionalGeneration.set_input_embeddings  s    "'

r2   c                     U R                   $ r   ru  rE   s    r0   get_output_embeddings7ChameleonForConditionalGeneration.get_output_embeddings  s    ||r2   c                     Xl         g r   r}  )r,   new_embeddingss     r0   set_output_embeddings7ChameleonForConditionalGeneration.set_output_embeddings  s    %r2   c                     Xl         g r   r  )r,   decoders     r0   set_decoder-ChameleonForConditionalGeneration.set_decoder  s    
r2   c                     U R                   $ r   r  rE   s    r0   get_decoder-ChameleonForConditionalGeneration.get_decoder  s    zzr2   r2  r  r   ru   r  r3  labelsr   r   r4  r5  r   r   r   c                 f   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  " SUUUUUUUU	U
UUS.UD6nUS   nU R                  U5      nU R                  R                  R                  n[        R                  " UR                  5      R                  USS2SS2U4'   SnUb)  U R                  " SUXpR                   R                  S.UD6n[        UUUR                  UR                   UR"                  S9$ )aE  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
>>> import torch
>>> import requests
>>> from PIL import Image

>>> model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16)
>>> processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")

>>> prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.<image><image>I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation."
>>> image = Image.open(requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw)
>>> image_2 = Image.open(requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw)

>>> inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.bfloat16)

>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```N)r2  r  r   ru   r  r3  r   r   r4  r5  r   r   )logitsr  r  )rE  r  r  r=   r:  r   )r   r   r4  r;  r  ru  r  r  r(   r^  r7   r_  loss_functionr  r   r  r=   r:  )r,   r2  r  r   ru   r  r3  r  r   r   r4  r5  r   r   r$  r=   r  r  rE  s                      r0   r@   )ChameleonForConditionalGeneration.forward  sE   X 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ** 
%)%+'/!5#)
 
  
m, zz44AA%*[[%>%B%Bq!\!"%%pVF{{OeOepiopD%#33!//))
 	
r2   c	                 V   > [         TU ]  " U4UUUUUUUS.U	D6n
US   S:w  a  S U
S'   U
$ )N)r  r  r   r3  r   ru   r   r   r  )r%   prepare_inputs_for_generation)r,   r2  r  r  r   r3  r   ru   r   r   model_inputsr/   s              r0   r  ?ChameleonForConditionalGeneration.prepare_inputs_for_generation  s\     w<

%+)')%

 

 !! ,0L(r2   )ru  r  r  )NNNNNNNNNNNN)NNNNNNT)rI   rJ   rK   rL   _tied_weights_keysr&   r"  r&  r~  r  r  r  r   r   r   r(   r  r(  r  r   r  r   rq  r   r   r   r@   r  rM   rN   rO   s   @r0   rs  rs    s    ++'(&  15481537+/59-1$(,0/3&*59Q
E,,-Q
 u001Q
 !.	Q

 u//0Q
 "%Q
   1 12Q
 ))*Q
 D>Q
 $D>Q
 'tnQ
 d^Q
 !!1!12Q
 *+Q
 
u,,	-Q
  Q
l  r2   rs  )rs  r  r  r  r  )r  )Sr   	functoolsr   typingr   r   r   r   r(   torch.nn.functionalr   r   r   torch.utils.checkpointactivationsr
   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   configuration_chameleonr   r   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr    
get_loggerrI   r   r  r"   r  rQ   r}   r   r   r   r   r  r   r  r  r   ra   r   r   r  r*  r0  rH  rY  ro  r}  r  r  r  r  rq  rs  __all__r   r2   r0   <module>r     s    % 3 3      ! . ) > B O F & 1  K  !!;J 
		H	%Jryy J(   , -
<ryy <@,D 0H *(8299 " &	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 U\\*% % %4D9 D9PHBII HVF		 FR,>BII ,>^	")) 	)(ryy )(X &RYY  &F^!BII ^!B,% ,%^ ? ? ?@ (- ((( n- n nb	 ?,j > 
R(@/ R
Rj pr2   