
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    LossKwargs,
    ModelOutput,
    auto_docstring,
    can_return_tuple,
    is_torch_flex_attn_available,
    logging,
)
from ..auto import AutoModel
from .configuration_csm import CsmConfig, CsmDepthDecoderConfig
from .generation_csm import CsmGenerationMixin


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


@dataclass
class CsmOutputWithPast(ModelOutput):
    """
Base class for the model autoregressive outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the depth decoder model.
    depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
    depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
    depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the backbone model.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    depth_decoder_loss: Optional[torch.FloatTensor] = None
    depth_decoder_logits: torch.FloatTensor = None
    depth_decoder_past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    depth_decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    depth_decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    backbone_loss: Optional[torch.FloatTensor] = None
@auto_docstring(
    custom_intro="""
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    """
)
class CsmPreTrainedModel(PreTrainedModel):
    config_class = CsmConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CsmDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, CsmCodebooksHead):
            num_codebooks = module.num_codebooks
            for i in range(num_codebooks - 1):
                module.weight.data[i].normal_(mean=0.0, std=std)
        elif isinstance(module, CsmRMSNorm):
            module.weight.data.fill_(1.0)


@use_kernel_forward_from_hub("RMSNorm")
class CsmRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        CsmRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class CsmRotaryEmbedding(nn.Module):
    def __init__(self, config: CsmConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class CsmMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r@   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr?   r-   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r!   N)r   r   reshape)r-   r   batchnum_key_value_headsslenhead_dims         r@   	repeat_kvr     s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr?   r]   querykeyvalueattention_maskscalingdropoutc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr   r	   rz   )r   r|   )ptrainingr!   )r   num_key_value_groupsr;   matmulr   r   rN   
functionalsoftmaxr~   r}   r|   r   r   
contiguous)r]   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r@   eager_attention_forwardr     s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r?   c                   F  ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\
\R                  \R                  4   S\\R                     S	\\   S
\\R                     S\\   S\
\R                  \\R                     \\
\R                        4   4S jjrSrU =r$ )CsmAttentioni'  z=Multi-headed attention from 'Attention Is All You Need' paperrK   	layer_idxc                 P  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        g )Nr   g      Tr   )rp   rq   rK   r   getattrru   num_attention_headsr   r   r   r   attention_dropout	is_causalrN   rO   attention_biasq_projk_projv_projo_projr\   rK   r   rw   s      r@   rq   CsmAttention.__init__*  sI   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r?   r-   position_embeddingsr   past_key_valuecache_positionr   r   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS5      (       a  [        R                  S	5        O[         U R                  R                     nU" U U	U
UU4U R"                  (       d  S
OU R$                  U R&                  S.UD6u  nnUR(                  " / UQSP76 R+                  5       nU R-                  U5      nUU4$ )Nrz   r!   r   )r   r   r  eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rH   )r   r   )r   r   r   viewr   r  r  r   updater   r   rK   _attn_implementationr   loggerwarning_oncer   r   r   r   r   r   r  )r\   r-   r  r   r  r  r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r@   r   CsmAttention.forwardA  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r?   )r   rK   r   r   r  r   r   r  r   r   r  )NN)r6   r7   r8   r9   r:   r"   intrq   r;   Tensorr   r   r   
LongTensorr   r   r   r>   r   r   s   @r@   r   r   '  s    G
y 
S 
8 +/590)||0) #5<<#=>0) !.	0)
 !0) !!1!120) -.0) 
u||Xell3XeELL>Q5RR	S0) 0)r?   r   c                     ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\	\\R                  \R                  4      S\\   S\\R                   \	\\R                   \R                   4      4   4S jjrSrU =r$ )rF   it  rK   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g )N)rK   r   rv   )rp   rq   ru   r   	self_attnr   mlprZ   rms_norm_epsinput_layernormpost_attention_layernormr  s      r@   rq   CsmDecoderLayer.__init__u  si    !--%VI&>)&*<*<&BUBUV(263E3E6K^K^(_%r?   r-   r   r   r  r  	use_cacher  r  r   r   c	                     Un
U R                  U5      nU R                  " SUUUUUUUUS.U	D6u  pX-   nUn
U R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU$ )N)r-   r   r   r  r  r$  r  r  r5   )r!  r  r"  r  )r\   r-   r   r   r  r  r$  r  r  r   residualself_attn_weightsoutputss                r@   r   CsmDecoderLayer.forward  s     !,,]; ,0>> 
,
')%)/) 3
,
 
,
( !0 !55mD/ 0 "++Gr?   )ru   r!  r  r"  r  )NNNFFNN)r6   r7   r8   r9   r"   r  rq   r;   r  r   r  r   boolr   r   r   r<   r   r>   r   r   s   @r@   rF   rF   t  s   `y `S ` 2637*.,1$)59KO'||' !.' u//0	'
 !' $D>' D>' !!1!12' &eELL%,,,F&GH' -.' 
u  (51B1BEDUDU1U+V"WW	X' 'r?   rF   c                   .  ^  \ rS rSr\rU 4S jrS rS r\	\
          SS\R                  S\\R                     S\\R                     S\\R                     S	\\   S
\\R                     S\\   S\\   S\\   S\\R                     S\\   S\\\4   4S jj5       5       r SS\\R                  S4   S\R                  S\R                  S	\S\4
S jjr\S\R                  S\S\S\R6                  S\R                  S\4S j5       rSrU =r$ )CsmDepthDecoderModeli  c           	      j  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  -  UR                  5      U l	        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                   UR"                  S9U l        ['        US9U l        SU l        [
        R,                  " UR                  UR                   SS9U l        U R1                  5         g s  snf )Nr  rK   Fr   )rp   rq   pad_token_idrV   
vocab_sizerN   rU   rX   backbone_hidden_sizeembed_tokens
ModuleListrY   num_hidden_layersrF   layersrZ   ru   r   normr   
rotary_embgradient_checkpointingrO   inputs_embeds_projector	post_initr  s      r@   rq   CsmDepthDecoderModel.__init__  s     !.. ++LL&*>*>ARAR*RU[UpUpqmmAFvG_G_A`aA`I_V/A`a
 v11v7J7JK	,F;&+#')yy1L1LfN`N`gl'm$ 	 bs   D0c                     U R                   $ r   r2  r   s    r@   get_input_embeddings)CsmDepthDecoderModel.get_input_embeddings         r?   c                     Xl         g r   r=  r\   r   s     r@   set_input_embeddings)CsmDepthDecoderModel.set_input_embeddings      !r?   	input_idsbackbone_last_hidden_stater   r   r,   inputs_embedsr$  r  output_hidden_statesr  flash_attn_kwargsr   c                    Ub:  [         R                  R                  5       (       d  [        R	                  S5        SnUb  UOU R
                  R                  nU	b  U	OU R
                  R                  n	Ub  UOU R
                  R                  nUSL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R	                  S5        SnU(       a  Uc
  [        5       nU
ci  Ub  UR                  5       OSnUb  UR                  S   OUR                  S   nUb  UR                  OUR                  n[         R                   " XU-   US9n
Uc  [         R"                  " U
S-
  SS	9nXR$                  -  nU R'                  UU-   5      nU
S   S:H  nUb	  X&SS2S4'   O?[         R                  R                  5       (       d  U(       a  [        R)                  S
5        U R+                  U5      nU R-                  X6XU5      nUnU
R/                  S5      nU R1                  UU5      nU	(       a  SOSnU(       a  SOSnU R2                  SU R
                  R4                    H8  nU	(       a  UU4-  nU" U4UUUUUU
US.UD6nUS   nU(       d  M/  UUS   4-  nM:     U R7                  U5      nU	(       a  UU4-  n[9        UU(       a  UOSUUS9$ )a*  
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
NzCustom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines position_ids from `cache_position` and as it requires them to be identical across the batch, the provided position_ids will be ignored.z;You must specify exactly one of input_ids or inputs_embeds.X`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r!   r   )minzvWhen the first codebook token is provided, `backbone_last_hidden_state` should also be provided for correct inference.r5   r   r   r  r  r$  r  r  last_hidden_stater,   r-   r.   )r;   compileris_compilingr  r  rK   r  rI  r$  
ValueErrorr8  r   r   get_seq_lengthr   r   arangeclampr0  r2  warningr9  _update_causal_maskr   r7  r5  r4  r6  r   )r\   rF  rG  r   r   r,   rH  r$  r  rI  r  rJ  past_seen_tokensinputs_seq_lengthr   codebook_idxsoffsetinput_ids_are_first_codebookr   r-   r  all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                            r@   r   CsmDepthDecoderModel.forward  s   * #ENN,G,G,I,IM  L1B1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<Z[[&&4==Yj I0*nO!CRC^==?de:G:S 3 3A 6YbYhYhijYk-:-F]))IL\L\F"\\*:O`<`iopN !KK(:BM"__4F --i&.@AM+9!+<+A()5&@ad#~~22449UNN Q 44]C..>L]
 & &//2"oom\J #7BD0d![[)H4;;+H+HIM#!m%55!)
*)."3#-$7
 $
M *!,M  =#3"55' J* 		-0  -!11&+/8Od+%	
 	
r?   r%   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ Nflash_attention_2rH   flex_attentionr   Fr  )rH  past_key_values_lengthis_trainingr!   rz   )sequence_lengthtarget_lengthr|   r  
batch_size)cudaxpunpurK   r  anyrM   r;   r  r&   rU  is_compileabler   _ignore_causal_mask_sdpar   r|   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r   finforN  _unmask_unattendedr\   r   rd  r  r,   r  rZ  using_compilable_cacher|   rk  rl  r   	min_dtypes                r@   rY  (CsmDepthDecoderModel._update_causal_mask7      ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr?   rk  rl  r|   rm  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ 	a  
    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


class CsmCodebooksHead(nn.Module):
    def __init__(self, hidden_size, num_codebooks, vocab_size):
        super().__init__()
        self.num_codebooks = num_codebooks
        self.weight = nn.Parameter(torch.empty(self.num_codebooks - 1, hidden_size, vocab_size))

    def forward(self, hidden_states, cache_position=None):
        if cache_position is None:
            seq_length = hidden_states.shape[1]
            codebook_weight = self.weight[torch.arange(seq_length)]
        else:
            codebook_idxs = cache_position - 1
            codebook_weight = self.weight[codebook_idxs]

        hidden_states = [
            nn.functional.linear(hidden_states[:, codebook_idx, :], codebook_weight[codebook_idx].T)
            for codebook_idx in range(codebook_weight.shape[0])
        ]
        hidden_states = torch.stack(hidden_states, dim=1)

        return hidden_states


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen as a position-specific language modeling head, allowing the use of a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
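

# ------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the upstream module): exercises the `CsmCodebooksHead`
# reconstruction above and shows that each codebook position is scored by its own linear head.
# The sizes are arbitrary demonstration values, and the weights are randomly initialized here only
# so the forward pass produces finite numbers; nothing below is called by the library code.
# ------------------------------------------------------------------------------------------------
def _codebooks_head_shape_demo():
    batch, hidden_size, num_codebooks, vocab_size = 2, 16, 4, 32

    head = CsmCodebooksHead(hidden_size, num_codebooks, vocab_size)
    head.weight.data.normal_()  # the parameter is created with torch.empty above

    # During training the depth decoder scores codebooks 1..num_codebooks-1 in one pass.
    hidden_states = torch.randn(batch, num_codebooks - 1, hidden_size)
    logits = head(hidden_states)
    assert logits.shape == (batch, num_codebooks - 1, vocab_size)

    # During generation only the current codebook position is scored, selected via `cache_position`.
    step_hidden = torch.randn(batch, 1, hidden_size)
    step_logits = head(step_hidden, cache_position=torch.tensor([1]))
    assert step_logits.shape == (batch, 1, vocab_size)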
    c            !       p  ^  \ rS rSrSrSrSrU 4S jrS rS r	S r
S r\\            SS\R                  S	\\R"                     S
\\R$                     S\\R                     S\\\\\R"                     4      S\\R"                     S\\R                     S\\   S\\   S\\   S\\R                     S\\\R$                  4   S\\   S\\\4   4S jj5       5       r    SS\R                  S\\   S
\\R                     S\\R"                     S\\R                     4
U 4S jjjrSrU =r$ )CsmDepthDecoderForCausalLMi  Nc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [        UR                  UR                  UR                  5      U l        U R                  5         g r   )
rp   rq   r,  rE   r0  rW   ru   rX   codebooks_headr:  r   s     r@   rq   #CsmDepthDecoderForCausalLM.__init__  sY     )&1
 ++.v/A/A6CWCWY_YjYjk 	r?   c                 .    U R                   R                  $ r   rE   r2  r   s    r@   r>  /CsmDepthDecoderForCausalLM.get_input_embeddings  s    zz&&&r?   c                 $    XR                   l        g r   r  rB  s     r@   rC  /CsmDepthDecoderForCausalLM.set_input_embeddings  s    "'

r?   c                     Xl         g r   rE   )r\   decoders     r@   set_decoder&CsmDepthDecoderForCausalLM.set_decoder  s    
r?   c                     U R                   $ r   r  r   s    r@   get_decoder&CsmDepthDecoderForCausalLM.get_decoder  s    zzr?   rF  rG  r   r   r,   rH  labelsr$  r  rI  r  logits_to_keepr   r   c                 n   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
U R                  " SUUUUUUUU	U
US.
UD6nUS   n[	        U[
        5      (       a!  US:X  a  [        SS5      nO[        U* S5      nOUnU R                  USS2USS24   Ub  UU   OS5      nUR                  5       nSnUbB  USSS24   R                  5       nU R                  " SUSU R                   R                  US.UD6n[        UUUR                  UR                  UR                  S9$ )	a  
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)
rF  rG  r   r   r,   rH  r$  r  rI  r  r   r!   .)r+   r  r0  shift_labels)r*   r+   r,   r-   r.   r5   )rK   r  rI  rE   rM   r  slicer  r   loss_functionr0  r   r,   r-   r.   )r\   rF  rG  r   r   r,   rH  r  r$  r  rI  r  r  r   r(  r-   slice_indicesr+   r*   r  s                       r@   r   "CsmDepthDecoderForCausalLM.forward  sx   6 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ** 
'A)%+'/!5)
 
  
nc**" %a %~ot <*M$$!]A-.Q_Qk}0Mqu
 ""$!#qr'?557L%% dt{{7M7M\hlrD &#33!//))
 	
r?   c                    > [         T	U ]  " XX4U40 UD6nUS   S   S:H  nU(       d  UR                  S5        UR                  S5        U$ )Nr  r   rG  r   )rp   prepare_inputs_for_generationpop)
r\   rF  r,   r   rH  r  r   model_inputsis_first_generation_steprw   s
            r@   r  8CsmDepthDecoderForCausalLM.prepare_inputs_for_generation<  sc     w<~
Y_
 $00@#A!#D#I '9: 	(r?   )r  rE   r0  )NNNNNNNNNNNr   NNNN) r6   r7   r8   r9   _tied_weights_keys_tp_plan_pp_planrq   r>  rC  r  r  r   r   r;   r  r   r<   r  r   r   r   r*  r  r   r  r   r   r   r  r>   r   r   s   @r@   r  r    s	    HH'(  '+BF1537KO59-1$(,0/35934J
##J
 %-U->->$?J
 !.	J

 u//0J
 "%tE4E4E/F(F"GHJ
   1 12J
 ))*J
 D>J
 $D>J
 'tnJ
 !!1!12J
 c5<</0J
 *+J
 
u,,	-J
  J
^ ,0595959## "% !!1!12	
   1 12 !!1!12 r?   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )CsmBackboneModelEmbeddingsiS  c                   > [         TU ]  5         [        R                  " UR                  UR
                  -  UR                  5      U l        U R                  S[        R                  " UR                  5      UR
                  -  SS9  g )Naudio_tokens_offsetsFr   )rp   rq   rN   rU   rX   r0  ru   embed_audio_tokensr   r;   rV  r   s     r@   rq   #CsmBackboneModelEmbeddings.__init__T  sn    "$,,0D0DvGXGX0X[a[m[m"n"ELL1E1E$FIZIZ$Zgl 	 	
r?   c                 ^    U R                  XR                  -   5      nUR                  SS9nU$ )Nr   r   )r  r  sum)r\   rF  input_embedss      r@   r   "CsmBackboneModelEmbeddings.forward[  s4    ..y;T;T/TU#''A'.r?   )r  r   r   s   @r@   r  r  S  s    
 r?   r  c                     ^  \ rS rSrU 4S jrS rS r\\         SS\	\
R                     S\	\
R                     S\	\
R                     S\	\   S	\	\
R                     S
\	\   S\	\   S\	\   S\	\
R                     S\\   S\4S jj5       5       r SS\\
R                  S4   S\
R                  S\
R                  S\S\4
S jjr\S\
R                  S\S\S\
R0                  S\
R                  S\4S j5       rSrU =r$ )CsmBackboneModelia  c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [        U5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr  r.  F)rp   rq   r/  rV   r0  r  r2  rN   r3  rY   r4  rF   r5  rZ   ru   r   r6  r   r7  r8  r:  r  s      r@   rq   CsmBackboneModel.__init__c  s     !.. ++6v>mmAFvG_G_A`aA`I_V/A`a
 v11v7J7JK	,F;&+# 	 bs   *Cc                     U R                   $ r   r=  r   s    r@   r>  %CsmBackboneModel.get_input_embeddingsr  r@  r?   c                     Xl         g r   r=  rB  s     r@   rC  %CsmBackboneModel.set_input_embeddingsu  rE  r?   rF  r   r   r,   rH  r$  r  rI  r  rJ  r   c
                 J   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        Sn[        U[        S5      [        45      (       d  [	        S5      eUc  U R                  U5      nU(       a  Uc
  [        5       nU	cD  Ub  UR                  5       OSn[        R                   " XUR"                  S   -   UR$                  S9n	Uc  U	R'                  S5      nU R)                  X%XU5      nUnU R+                  X5      nU(       a  S	OSnU(       a  S	OSnU R,                  SU R                   R.                    H7  nU(       a  X4-  nU" U4UUUUUU	US
.U
D6nUS   nU(       d  M.  UUS   4-  nM9     U R1                  U5      nU(       a  X4-  n[3        UU(       a  UOSUUS9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
    1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
    requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

    2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
Nz:You must specify exactly one of input_ids or inputs_embedsrL  FzBThe `past_key_values` should be either a `Cache` object or `None`.r   r!   rM  r5   rO  rP  )rK   r  rI  r$  rT  r8  r   r  r  rM   r   r   r2  r   rU  r;   rV  r   r   r   rY  r7  r5  r4  r6  r   )r\   rF  r   r   r,   rH  r$  r  rI  r  rJ  rZ  r   r-   r  r_  r`  ra  rb  s                      r@   r   CsmBackboneModel.forwardx  sI   6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I /DJ+>??abb  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 & #oomJ #7BD0d![[)H4;;+H+HIM#!%55!)
*)."3#-$7
 $
M *!,M  =#3"55' J* 		-0  !11&+/8Od+%	
 	
r?   r%   rd  c           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ rf  rq  ry  s                r@   rY  $CsmBackboneModel._update_causal_mask  r}  r?   rk  rl  r|   rm  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ r  r  r  s              r@   rv  FCsmBackboneModel._prepare_4d_causal_attention_mask_with_cache_position(  r  r?   )r2  r8  r5  r6  rV   r7  r0  )	NNNNNNNNNr  )r6   r7   r8   r9   rq   r>  rC  r   r   r   r;   r  r  r   r<   r*  r   r   r   r   r   rY  r  r  r|   rv  r>   r   r   s   @r@   r  r  a  s   !"  151537+/59$(,0/359h
E,,-h
 !.h
 u//0	h

 "%h
   1 12h
 D>h
 $D>h
 'tnh
 !!1!12h
 $$89h
 
!h
  h
` #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r?   r  z
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    c            #         ^  \ rS rSrSS/rU 4S jrS rS rS rS r	S	 r
\U 4S
 j5       rU 4S jr    S$S\\R                      S\\R                      S\\R                      S\\R                      S\\R                      4
S jjr    S$S\R$                  S\\   S\\R$                     S\\R(                     S\\R$                     4
U 4S jjjr\\             S%S\R$                  S\\R                      S\\R                      S\\R                      S\\R$                     S\\\\\R(                     4      S\\R(                     S\\R$                     S\\   S\\   S\\   S\\R$                     S\\\R                   4   S\\   S\\\4   4S jj5       5       r \!S\R                   S\S\S \RD                  S\R                   S!\4S" j5       r#S#r$U =r%$ )&CsmForConditionalGenerationi`  z5backbone_model.embed_tokens.embed_audio_tokens.weightz'depth_decoder.model.embed_tokens.weightc                   > [         TU ]  U5        UR                  U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  U5      U l        [        R                  UR                  5      U l        [         R"                  " UR$                  5      U l        U R)                  5         g )NFr   )rp   rq   r0  rN   rO   ru   lm_headrU   text_vocab_sizeembed_text_tokensr  _from_configbackbone_modelr  depth_decoder_configdepth_decoderr    from_configcodec_configcodec_modelr:  r   s     r@   rq   $CsmForConditionalGeneration.__init__k  s      ++yy!3!3V5F5FUS!#f.D.DfFXFX!Y.;;FC7DDVE`E`a$001D1DE 	r?   c                 .    U R                   R                  $ r   r  r2  r   s    r@   r>  0CsmForConditionalGeneration.get_input_embeddingsw  s    ""///r?   c                 $    XR                   l        g r   r  rB  s     r@   rC  0CsmForConditionalGeneration.set_input_embeddingsz  s    +0(r?   c                     U R                   $ r   r  r   s    r@   get_output_embeddings1CsmForConditionalGeneration.get_output_embeddings}  s    ||r?   c                     Xl         g r   r  )r\   new_embeddingss     r@   set_output_embeddings1CsmForConditionalGeneration.set_output_embeddings  s    %r?   c                     U R                   R                  (       aO  U R                  U R                  R                  R
                  U R                  R                  R                  5        g g r   )rK   tie_codebooks_embeddings_tie_or_clone_weightsr  r2  r  r  rE   r   s    r@   _tie_weights(CsmForConditionalGeneration._tie_weights  sL    ;;//&&##00CC""((55 0r?   c                    > UR                  SS5      (       a  [        T
U ]  " U0 UD6u  p4O[        T
U ]  " U0 UD6nSn[        U5      n[	        UR
                  5      R                  5        VVs0 s H"  u  pxUR                  U5      (       d  M  XvS  U_M$     n	nn[	        UR                  R
                  5      R                  SS0U	E5        U	 H  n[        UR
                  XW-   5        M     SU;   a  UW4$ U$ s  snnf )Noutput_loading_infoFdepth_decoder__from_model_config)r   rp   from_pretrainedlenvarsgeneration_configitems
startswithr  r  delattr)clsargsr   rE   loading_infoprefix
prefix_lenattrr   depth_decoder_attrsrw   s             r@   r  +CsmForConditionalGeneration.from_pretrained  s   ::+U33"''"94"J6"JE<G+T<V<E "[
  $E$;$;<BBD
Dv& %Du$D 	 
 	U  223::<PRW;o[n;op (DE++V]; ( !F*,&&L
s   /C:	C:c                    > SnU R                   R                  R                  5       nUR                  SS 5        UR	                  5        H  u  pV[        U R                  X5-   U5        M      [        TU ]  " U0 UD6  g )Nr  transformers_version)r  r  to_diff_dictr  r  setattrrp   save_pretrained)r\   r  r   r  r  r
  r   rw   s          r@   r  +CsmForConditionalGeneration.save_pretrained  sq    !"00BBOOQ 6=.446KDD**FM5A 7 	00r?   rF  input_valuesinput_values_cutoffsr  r   c                    U R                  U5      nUGb+  [        R                  R                  US5      nX3S:     R	                  5       nXfS:     n[
        R                  " UR                  5       UR                  S9R                  [        U5      S5      nXvR                  S5      :  n/ n[        X#5       H  u  pXS:     n
[        U
R                  S   S-
  5       Hp  nX   nXS-      nU	SX24   nU R                  R!                  UR                  S5      5      nUR"                  R%                  SS5      nUR'                  US   5        Mr     M     [        S U 5       5      n[
        R(                  " U Vs/ s H7  n[        R                  R                  USSSUUR                  S   -
  45      PM9     sn5      nU R                  R+                  U5      nU R,                  R.                  nUU:H  nU R0                  R3                  U5      nUU   UU'   [
        R4                  " SSU R,                  R6                  4UR                  [
        R8                  S	9U R,                  R:                  -  nU R0                  R3                  U5      R=                  S5      nXR,                  R>                  :H  nURA                  URC                  5       S5      UU'   Ubb  UR                  S5      RA                  SSU R,                  R6                  5      nUU   UU'   US
:H  RE                  SS9nSUUS   US   SS24'   UnXTS.$ s  snf )a8  
Merges the input_ids and input_values to produce a single inputs_embeds tensor:
1 - Infers the codec model on the input_values to retreive codebook token.
2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
3 - If labels are provided, expands them to match codebook dimensions and position the target codebook tokens in the inputs_embeds tensor.

Args:
    input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
        The input ids to embed.
    input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
        The audio input values to embed.
    input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
        The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
Nr!   r   r   rM  rz   r!   .c              3   >   #    U  H  oR                   S    v   M     g7f)r   N)r   ).0els     r@   	<genexpr>QCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<genexpr>  s     "K9J288A;9Js   )r   r|   iTas_tuple)rH  r  )#r  rN   r   paddiffr;   rV  maxr   r   r  r   ziprY   r   r  encodeaudio_codesr   appendr  get_audio_codes_maskrK   audio_token_idr  r2  rs   rX   longcodebook_eos_token_idsqueezeaudio_eos_token_idrepeatr  nonzero)r\   rF  r  r  r  rH  audio_lengthsinput_values_maskaudio_tokens_listbatch_input_valuesbatch_input_values_cutoffsr^   	start_idxend_idxaudio_batchcodec_outputscodebook_idsmax_audio_framesr  batched_audio_token_idsaudio_codes_maskr'  audio_token_maskaudio_embedsaudio_eos_frame_idsaudio_eos_embedsaudio_eos_token_masklabels_expanded depth_decoder_ignore_frames_idxss                                r@   "_merge_input_ids_with_input_values>CsmForConditionalGeneration._merge_input_ids_with_input_values  sQ   * ..y9##%==#4#45I6#R 01JKPPRM)!*;<M %-A-E-E-GP\PcPc d k kM"B! !24K4KA4N N
 !#BElBi>"-GfgHg-h*9??BQFGA : =I8Q?G"4S):K5K"LK$($4$4$;$;K<Q<QRS<T$UM#0#<#<#F#Fq"#ML%,,\!_= H Cj  #"K9J"KK&+kk\mn\mVX""21a1ABHHQK1O'PQ\mn'#  $//DDEVW![[77N(N:..;;<STL.:;K.LM*+ 

Aq$++";";<YEUEU]b]g]gh++334    $22??@ST\\]^_#,0N0N#N 2B2I2IJ^JbJbJdfg2hM./ !"("2"22"6"="=aDKKD]D]"^4KL\4] 014:dN3K3KUY3K3Z0pt @ CEefgEhjkjl lm(!.AA; os   =>Mr,   r   rH  r  c           	      2  > [         T	U ]  " S	UUUUUS.UD6nUb|  UR                  S:X  al  UR                  S5      cZ  U R	                  UUR                  S5      UR                  S5      UR                  S5      S9nUR                  US   US   S S.5        U$ )
N)rF  r,   r   rH  r  r   rH  r  r  r  )rF  r  r  r  )rH  r  rF  r5   )rp   r  ndimr   rB  r  )
r\   rF  r,   r   rH  r  r   r  merged_inputsrw   s
            r@   r  9CsmForConditionalGeneration.prepare_inputs_for_generation  s     w< 
+)')
 
  Y^^q%8\=M=Mo=^=f CC##ZZ7%+ZZ0F%Gzz(+	 D M "/"@MZbLcrvw r?   r   r$  r  rI  r  r   c                 L   U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb.  UR                  S:X  a  U R	                  XXH5      nUS   nUS   nSnU R
                  " SUUUUUU	U
UUS.	UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnSnSnSnUb  USS2SS2S4   nU R                  " SUUU R                   R                  S.UD6nUSS2SS2SS24   S	:H  R                  S
S9) nUU   SSU R                   R                  S-
  24   n[        R                  R!                  USSS9nUR#                  SS9nUUS   US   S-
  SS24   nUU   nU R%                  UUU	U
USUS9nUR&                  nUU-   n[)        UUUUUR*                  UR,                  UR.                  Ub  UR0                  OSUb  UR*                  OSUb  UR,                  OSUb  UR.                  S9$ SS9$ )a_  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
    1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
    requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

    2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
    Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
    If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
    where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
    the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]] (a short construction sketch follows these argument descriptions).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
    Requires the corresponding `input_values` to be provided, as the audio tokens are inferred from them using the `codec_model`.
    - `config.audio_token_id` indicates an audio frame (sequence-length elements are treated as frames)
    - `-100` will be ignored in the loss computation
    - `-101` indicates an audio frame that will be used only for the backbone model (using the first codebook token as its label)

    Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
logits_to_keep (`int` or `torch.Tensor`, *optional*):
    Kept for compatibility. Only the following values are supported:
    1. `0`, which is equivalent to keeping all logits, used in the training regime
    2. `1`, which is equivalent to keeping only the last logit, used in the generation regime
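
A short construction sketch for `input_values_cutoffs` in the two-sequence case described above; the segment lengths are placeholder values:

```python
import torch

l1, l2 = 24000, 32000  # placeholder segment lengths in samples
# first entry has two segments (cumulative ends l1 and 2 * l1); second entry has one, padded with -1
input_values_cutoffs = torch.tensor([[l1, 2 * l1], [l2, -1]])
# input_values_cutoffs.shape -> torch.Size([2, 2]), i.e. (batch_size, max_num_audio)
```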

Example:

```python
>>> import torch
>>> from transformers import CsmForConditionalGeneration, AutoProcessor
>>> from datasets import load_dataset, Audio

>>> model_id = "eustlb/csm-1b"
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

>>> processor = AutoProcessor.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
>>> # ensure the audio is 24kHz
>>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

>>> conversation = []
>>> # prepare a conversation with text and corresponding audio
>>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
...     conversation.append(
...         {
...             "role": f"{speaker_id}",
...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
...         }
...     )

>>> inputs = processor.apply_chat_template(
...     conversation,
...     tokenize=True,
...     return_dict=True,
...     output_labels=True,
... ).to(torch_device)

>>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
>>> output = model(**inputs)
>>> output.loss.backward()
```Nr   rH  r  )	rF  r   r   r,   rH  r$  r  rI  r  r   )r+   r  r0  r!   r  rz   r   .r  )r   Tr  )rF  rG  r$  r  rI  return_dictr  )r*   r4   r/   r+   r,   r-   r.   r0   r1   r2   r3   r5   )rK   r  rI  rE  rB  r  rM   r  r  r  r  r0  allrX   rN   r   r  r-  r  r*   r(   r,   r-   r.   r+   )r\   rF  r  r   r  r   r,   rH  r  r$  r  rI  r  r  r   rF  backbone_outputsbackbone_hidden_statesr  backbone_logitsr*   r4   r/   depth_decoder_outputsbackbone_labels
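
For inference, generation can be run on inputs processed the same way as above. A minimal sketch, assuming the `output_audio=True` generation flag and the `CsmProcessor.save_audio` helper described in the library documentation; neither is visible in this file, so treat both as assumptions:

```python
>>> gen_inputs = processor.apply_chat_template(conversation, tokenize=True, return_dict=True).to(torch_device)
>>> audio = model.generate(**gen_inputs, output_audio=True)  # assumed flag: decode codebook tokens to waveforms
>>> processor.save_audio(audio, "generated.wav")  # assumed helper on CsmProcessor
```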
train_maskdepth_decoder_input_ids
train_idxsbackbone_last_hidden_statesdepth_decoder_labelss                                 r@   r   #CsmForConditionalGeneration.forward  s   l 2C1N-TXT_T_TqTq$8$D $++JjJj 	  Y^^q%8 CC)=M */:M"8,FI.. 
)%+'/!5)
 
 "2!!48B>SV8W8W~ot4]k,,'=aPQ>Q'RS! $$Q1WoO .. &4;;KaKaekM "!Q(+t388R8@@J&,Z&8>]@Y@Y\]@]>]9]&^#&(mm&7&78OQW_`&7&a##++T+:J*@APZ[\P]`aPacdAd*e'#)*#5 $($6$61+F#"3%9 + %7 %! "7!;!; #55D '1",<<*88'22AVAb!6!=!=hl$0 +@*O*O$0 )>(K(KI^Ij%:%E%E
 	
 qu
 	
r?   rk  rl  r|   rm  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ r  r  r  s              r@   rv  QCsmForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  r  r?   )r  r  r  r  r  r0  r  )NNNNNNNNNNNNr   )&r6   r7   r8   r9   r  rq   r>  rC  r  r  r  classmethodr  r  r   r;   r  rB  r  r   r<   r  r   r   r   r   r*  r  r   r  r   r(   r   r  r|   rv  r>   r   r   s   @r@   r  r  `  s    	@1

01&  41 -1/37;)-NBELL)NB u||,NB 'u||4	NB
 &NB 
%,,	NBf ,0595959## "% !!1!12	
   1 12 !!1!12 >  '+/3157;37KO59-1$(,0/35934f
##f
 u||,f
 !.	f

 'u||4f
 u//0f
 "%tE4E4E/F(F"GHf
   1 12f
 ))*f
 D>f
 $D>f
 'tnf
 !!1!12f
 c5<</0f
 *+f
  
u''	(!f
  f
P 444 4 {{	4
 4 4 4r?   r  )rC   r  r,  r  r  r  )rH   )Qdataclassesr   typingr   r   r   r   r   r;   torch.nnrN   activationsr
   cache_utilsr   r   
generationr   integrationsr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   autor    configuration_csmr"   r#   generation_csmr$   !torch.nn.attention.flex_attentionr%   integrations.flex_attentionr&   
get_loggerr6   r  r(   rC   ModulerZ   r   r   r   r   r  r  r   r   r   r   rF   r,  rW   r  r  r  r  r  __all__r5   r?   r@   <module>rp     s  , " 9 9   ! . ) 7 > B 9 O K F & u u  ? .  !!;J 
		H	% 76 76 76t 
 * * *B Y'J J (J(< <DRYY  (6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %4J)299 J)Z20 2j F- F FRryy . ?,j > |!3_ ||~  {) { {| 
X"46H X
Xvr?   