from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
from ..auto import AutoModel
from ..llama.modeling_llama import (
    KwargsForCausalLM,
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaModel,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
)
from .configuration_csm import CsmConfig, CsmDepthDecoderConfig
from .generation_csm import CsmGenerationMixin


logger = logging.get_logger(__name__)


@dataclass
class CsmOutputWithPast(ModelOutput):
    r"""
Base class for the model autoregressive outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the depth decoder model.
    depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
    depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
    depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the backbone model.
Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss )__name__
__module____qualname____firstlineno____doc__r&   r   torchFloatTensor__annotations__r'   r(   r   r)   r*   r+   r,   r-   r.   r/   r0   __static_attributes__r1       [/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/csm/modular_csm.pyr$   r$   /   s(   *X )-D(5$$
%, $FE$AEOXeE%*;*;$<=>E=AM8E%"3"3S"89:A:>Ju00#567>6:!2!23:.2%++2OS!8E%8I8I2J,K#LSKO%0A0A30F*G!HOHLhuU->->-C'DEL15M8E--.5r;   r$   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    )custom_introc                   J    \ rS rSr\rSrSrS/rS/r	Sr
SrSrSrSrSrS rSrg)	CsmPreTrainedModelj   modelTCsmDecoderLayerr(   c                 T   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g [        U[        5      (       aI  UR                  n[        US-
  5       H*  nUR
                  R                  U   R                  SUS9  M,     g [        U[        5      (       a&  UR
                  R                  R!                  S5        g g )Ng        )meanstdr   g      ?)configinitializer_range
isinstancennLinearweightdatanormal_biaszero_	Embeddingpadding_idxCsmCodebooksHeadnum_codebooksrange
CsmRMSNormfill_)selfmodulerE   rS   is        r<   _init_weights CsmPreTrainedModel._init_weights   s=   kk++fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . 011"00M=1,-""1%--3C-@ .
++MM$$S) ,r;   r1   N)r2   r3   r4   r5   r    config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendrZ   r:   r1   r;   r<   r?   r?   j   sQ     L&*#*+#4"5!N ! $!"&*r;   r?   c                       \ rS rSrSrg)rU      r1   Nr2   r3   r4   r5   r:   r1   r;   r<   rU   rU          r;   rU   c                       \ rS rSrSrg)CsmRotaryEmbedding   r1   Nri   r1   r;   r<   rl   rl      rj   r;   rl   c                       \ rS rSrSrg)CsmMLP   r1   Nri   r1   r;   r<   ro   ro      rj   r;   ro   c                       \ rS rSrSrg)CsmAttention   r1   Nri   r1   r;   r<   rr   rr      rj   r;   rr   c                       \ rS rSrSrg)rB      r1   Nri   r1   r;   r<   rB   rB      rj   r;   rB   c                   P  ^  \ rS rSr\rU 4S jr\\          SS\	R                  S\\	R                     S\\	R                     S\\	R                     S\\   S\\	R                     S	\\   S
\\   S\\   S\\	R                     S\\   S\\\4   4S jj5       5       rSrU =r$ )CsmDepthDecoderModel   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  -  UR                  5      U l        [        R                  " UR                  UR                  SS9U l
        g NF)rN   )super__init__rI   rP   rS   
vocab_sizebackbone_hidden_sizeembed_tokensrJ   hidden_sizeinputs_embeds_projectorrW   rF   	__class__s     r<   r|   CsmDepthDecoderModel.__init__   s]     LL&*>*>ARAR*RU[UpUpq')yy1L1LfN`N`gl'm$r;   	input_idsbackbone_last_hidden_stateattention_maskposition_idsr(   inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statescache_positionflash_attn_kwargsreturnc                    Ub:  [         R                  R                  5       (       d  [        R	                  S5        SnUb  UOU R
                  R                  nU	b  U	OU R
                  R                  n	Ub  UOU R
                  R                  nUSL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R	                  S5        SnU(       a  Uc
  [        5       nU
ci  Ub  UR                  5       OSnUb  UR                  S   OUR                  S   nUb  UR                  OUR                  n[         R                   " XU-   US9n
Uc  [         R"                  " U
S-
  SS	9nXR$                  -  nU R'                  UU-   5      nU
S   S:H  nUb	  X&SS2S4'   O?[         R                  R                  5       (       d  U(       a  [        R)                  S
5        U R+                  U5      nU R-                  X6XU5      nUnU
R/                  S5      nU R1                  UU5      nU	(       a  SOSnU(       a  SOSnU R2                  SU R
                  R4                    H8  nU	(       a  UU4-  nU" U4UUUUUU
US.UD6nUS   nU(       d  M/  UUS   4-  nM:     U R7                  U5      nU	(       a  UU4-  n[9        UU(       a  UOSUUS9$ )a*  
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
        """
        if position_ids is not None and not torch.compiler.is_compiling():
            logger.warning_once(
                "Custom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines "
                "position_ids from `cache_position` and as it requires them to be identical across the batch, the "
                "provided position_ids will be ignored."
            )
            position_ids = None

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds.")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            inputs_seq_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_seq_length, device=device)

        if inputs_embeds is None:
            # each codebook position uses its own slice of the shared embedding table
            codebook_idxs = torch.clamp(cache_position - 1, min=0)
            offset = codebook_idxs * self.vocab_size
            inputs_embeds = self.embed_tokens(input_ids + offset)

            input_ids_are_first_codebook = cache_position[0] == 0
            if backbone_last_hidden_state is not None:
                inputs_embeds[:, 0] = backbone_last_hidden_state
            elif not torch.compiler.is_compiling() and input_ids_are_first_codebook:
                logger.warning(
                    "When the first codebook token is provided, `backbone_last_hidden_state` should also be provided "
                    "for correct inference."
                )

        inputs_embeds = self.inputs_embeds_projector(inputs_embeds)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds
        position_ids = cache_position.unsqueeze(0)

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

class CsmCodebooksHead(nn.Module):
    def __init__(self, hidden_size, num_codebooks, vocab_size):
        super().__init__()
        self.num_codebooks = num_codebooks
        self.weight = nn.Parameter(torch.empty(self.num_codebooks - 1, hidden_size, vocab_size))

    def forward(self, hidden_states, cache_position=None):
        if cache_position is None:
            seq_length = hidden_states.shape[1]
            codebook_weight = self.weight[torch.arange(seq_length)]
        else:
            codebook_idxs = cache_position - 1
            codebook_weight = self.weight[codebook_idxs]

        # apply a different linear projection for each codebook position
        hidden_states = [
            nn.functional.linear(hidden_states[:, codebook_idx, :], codebook_weight[codebook_idx].T)
            for codebook_idx in range(codebook_weight.shape[0])
        ]
        hidden_states = torch.stack(hidden_states, dim=1)

        return hidden_states


@auto_docstring(
    custom_intro="""
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen as a position-specific language modeling head, allowing to use a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    """
)
class CsmDepthDecoderForCausalLM(LlamaForCausalLM, GenerationMixin):
    _tied_weights_keys = None
    _tp_plan = None
    _pp_plan = None

    def __init__(self, config):
        super().__init__(config)
        del self.lm_head
        self.codebooks_head = CsmCodebooksHead(config.hidden_size, config.num_codebooks, config.vocab_size)
        self.model = CsmDepthDecoderModel(config)

    def get_output_embeddings(self):
        raise AttributeError("Not needed for Csm")

    def set_output_embeddings(self, new_embeddings):
        raise AttributeError("Not needed for Csm")

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values, attention_mask, inputs_embeds, cache_position, **kwargs
        )

        # `backbone_last_hidden_state` is only needed on the first generation step,
        # where it stands in for the first codebook token
        is_first_generation_step = model_inputs["cache_position"][0] == 0
        if not is_first_generation_step:
            model_inputs.pop("backbone_last_hidden_state", None)

        # position_ids are inferred from cache_position
        model_inputs.pop("position_ids", None)
        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        backbone_last_hidden_state: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.model(
            input_ids=input_ids,
            backbone_last_hidden_state=backbone_last_hidden_state,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        if isinstance(logits_to_keep, int):
            if logits_to_keep == 0:
                # skip position 0: it holds the concatenated backbone last hidden state,
                # for which no codebook token is predicted
                slice_indices = slice(1, None)
            else:
                slice_indices = slice(-logits_to_keep, None)
        else:
            slice_indices = logits_to_keep

        logits = self.codebooks_head(
            hidden_states[:, slice_indices, :], cache_position[slice_indices] if cache_position is not None else None
        )
        logits = logits.contiguous()

        loss = None
        if labels is not None:
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_function(
                logits=logits, labels=None, vocab_size=self.config.vocab_size, shift_labels=shift_labels, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class CsmBackboneModelEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_audio_tokens = nn.Embedding((config.num_codebooks * config.vocab_size), config.hidden_size)
        self.register_buffer(
            "audio_tokens_offsets", torch.arange(config.num_codebooks) * config.vocab_size, persistent=False
        )

    def forward(self, input_ids):
        # offset each codebook into its own slice of the shared embedding table,
        # then sum the per-codebook embeddings into a single frame embedding
        input_embeds = self.embed_audio_tokens(input_ids + self.audio_tokens_offsets)
        input_embeds = input_embeds.sum(dim=2)
        return input_embeds


class CsmBackboneModel(LlamaModel):
    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = CsmBackboneModelEmbeddings(config)

    @can_return_tuple
    @auto_docstring
    def forward(self, **super_kwargs):
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        """
        return super().forward(**super_kwargs)


@auto_docstring(
    custom_intro="""
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    """
)
class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
    _tied_weights_keys = [
        "backbone_model.embed_tokens.embed_audio_tokens.weight",
        "depth_decoder.model.embed_tokens.weight",
    ]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.embed_text_tokens = nn.Embedding(config.text_vocab_size, config.hidden_size)
        self.backbone_model = CsmBackboneModel._from_config(config)
        self.depth_decoder = CsmDepthDecoderForCausalLM._from_config(config.depth_decoder_config)
        self.codec_model = AutoModel.from_config(config.codec_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.backbone_model.embed_tokens

    def set_input_embeddings(self, value):
        self.backbone_model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def _tie_weights(self):
        if self.config.tie_codebooks_embeddings:
            self._tie_or_clone_weights(
                self.backbone_model.embed_tokens.embed_audio_tokens,
                self.depth_decoder.model.embed_tokens,
            )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        if kwargs.get("output_loading_info", False):
            model, loading_info = super().from_pretrained(*args, **kwargs)
        else:
            model = super().from_pretrained(*args, **kwargs)

        # copy the `depth_decoder_*` attributes of the model generation config
        # onto the depth decoder's own generation config
        prefix = "depth_decoder_"
        prefix_len = len(prefix)
        depth_decoder_attrs = {
            attr[prefix_len:]: value
            for attr, value in vars(model.generation_config).items()
            if attr.startswith(prefix)
        }
        vars(model.depth_decoder.generation_config).update({"_from_model_config": False, **depth_decoder_attrs})

        # remove them from the model generation config
        for attr in depth_decoder_attrs:
            delattr(model.generation_config, prefix + attr)

        if "output_loading_info" in kwargs:
            return model, loading_info

        return model

    def save_pretrained(self, *args, **kwargs):
        # flatten the depth decoder generation config into `depth_decoder_*` attributes
        # of the model generation config before saving
        prefix = "depth_decoder_"
        depth_decoder_attrs = self.depth_decoder.generation_config.to_diff_dict()
        depth_decoder_attrs.pop("transformers_version", None)
        for attr, value in depth_decoder_attrs.items():
            setattr(self.generation_config, prefix + attr, value)

        super().save_pretrained(*args, **kwargs)

    def _merge_input_ids_with_input_values(
        self,
        input_ids: Optional[torch.Tensor] = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
    ):
        """
        Merges the input_ids and input_values to produce a single inputs_embeds tensor:
        1 - Infers the codec model on the input_values to retrieve codebook tokens.
        2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
        3 - If labels are provided, expands them to match codebook dimensions and positions the target codebook tokens in the inputs_embeds tensor.

        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The input ids to embed.
            input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
                The audio input values to embed.
            input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
                The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        """
        inputs_embeds = self.embed_text_tokens(input_ids)

        if input_values is not None:
            # infer input_values_mask
            input_values_cutoffs = nn.functional.pad(input_values_cutoffs, (1, 0))
            audio_lengths = input_values_cutoffs[input_values_cutoffs >= 0].diff()
            audio_lengths = audio_lengths[audio_lengths > 0]
            input_values_mask = torch.arange(input_values_cutoffs.max(), device=input_values.device).expand(
                len(audio_lengths), -1
            )
            input_values_mask = input_values_mask < audio_lengths.unsqueeze(1)

            # =======================================
            # TODO: @eustlb, this should be batched !!!
            # but requires making sure batched inference of the codec model works as intended
            with torch.no_grad():
                audio_tokens_list = []
                for batch_input_values, batch_input_values_cutoffs in zip(input_values, input_values_cutoffs):
                    batch_input_values_cutoffs = batch_input_values_cutoffs[batch_input_values_cutoffs >= 0]
                    for i in range(batch_input_values_cutoffs.shape[0] - 1):
                        start_idx = batch_input_values_cutoffs[i]
                        end_idx = batch_input_values_cutoffs[i + 1]
                        audio_batch = batch_input_values[..., start_idx:end_idx]
                        codec_outputs = self.codec_model.encode(audio_batch.unsqueeze(0))
                        codebook_ids = codec_outputs.audio_codes.transpose(1, -1)
                        audio_tokens_list.append(codebook_ids[0])

                max_audio_frames = max(el.shape[0] for el in audio_tokens_list)
                batched_audio_token_ids = torch.stack(
                    [nn.functional.pad(el, (0, 0, 0, max_audio_frames - el.shape[0])) for el in audio_tokens_list]
                )
                audio_codes_mask = self.codec_model.get_audio_codes_mask(input_values_mask)
            # =======================================

            audio_token_id = self.config.audio_token_id
            audio_token_mask = input_ids == audio_token_id

            audio_embeds = self.backbone_model.embed_tokens(batched_audio_token_ids)
            inputs_embeds[audio_token_mask] = audio_embeds[audio_codes_mask]

            # same for the audio eos token
            audio_eos_frame_ids = (
                torch.ones((1, 1, self.config.num_codebooks), device=input_ids.device, dtype=torch.long)
                * self.config.codebook_eos_token_id
            )
            audio_eos_embeds = self.backbone_model.embed_tokens(audio_eos_frame_ids).squeeze(1)

            audio_eos_token_mask = input_ids == self.config.audio_eos_token_id
            inputs_embeds[audio_eos_token_mask] = audio_eos_embeds.repeat(audio_eos_token_mask.sum(), 1)

            # if labels are provided, expand them to (batch_size, sequence_length, num_codebooks)
            # and place the target codebook tokens at the audio frame positions
            if labels is not None:
                labels_expanded = labels.unsqueeze(-1).repeat(1, 1, self.config.num_codebooks)
                labels_expanded[audio_token_mask] = batched_audio_token_ids[audio_codes_mask]
                labels_expanded[audio_eos_token_mask] = audio_eos_frame_ids.squeeze(0)
                # frames labeled -101 are used by the backbone model only: mask them out for the depth decoder
                depth_decoder_ignore_frames_idxs = (labels == -101).nonzero(as_tuple=True)
                labels_expanded[depth_decoder_ignore_frames_idxs[0], depth_decoder_ignore_frames_idxs[1], 1:] = -100
                labels = labels_expanded

        return {"inputs_embeds": inputs_embeds, "labels": labels}

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        # on the prompt (2D input_ids), merge the audio inputs into the text embeddings
        if input_ids is not None and input_ids.ndim == 2 and model_inputs.get("inputs_embeds") is None:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids=input_ids,
                input_values=kwargs.get("input_values"),
                input_values_cutoffs=kwargs.get("input_values_cutoffs"),
                labels=kwargs.get("labels"),
            )
            model_inputs.update(
                {"inputs_embeds": merged_inputs["inputs_embeds"], "labels": merged_inputs["labels"], "input_ids": None}
            )

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, CsmOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
            Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
            If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
            where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
            the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
            Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
            - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

            Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Kept for compatibility. Does not support any value other than:
            1. `0`, which is equivalent to keeping all logits, used in the training regime
            2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

        Example:

        ```python
        >>> import torch
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

        >>> model_id = "eustlb/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        >>> # ensure the audio is 24kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

        >>> conversation = []
        >>> # prepare a conversation with text and corresponding audio
        >>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
        ...     conversation.append(
        ...         {
        ...             "role": f"{speaker_id}",
        ...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        ...         }
        ...     )

        >>> inputs = processor.apply_chat_template(
        ...     conversation,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     output_labels=True,
        ... ).to(torch_device)

        >>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
        >>> output = model(**inputs)
        >>> output.loss.backward()
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_ids is not None and input_ids.ndim == 2:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids, input_values, input_values_cutoffs, labels
            )
            inputs_embeds = merged_inputs["inputs_embeds"]
            labels = merged_inputs["labels"]
            input_ids = None

        backbone_outputs = self.backbone_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        backbone_hidden_states = backbone_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        backbone_logits = self.lm_head(backbone_hidden_states[:, slice_indices, :])

        loss = None
        backbone_loss = None
        depth_decoder_loss = None
        depth_decoder_outputs = None
        if labels is not None:
            # select the first codebook as labels for the backbone model
            backbone_labels = labels[:, :, 0]
            backbone_loss = self.loss_function(
                logits=backbone_logits, labels=backbone_labels, vocab_size=self.config.vocab_size, **kwargs
            )

            # for the depth decoder, train only on frames that are not uniformly ignored along the codebook dimension
            train_mask = ~(labels[:, :, 1:] == -100).all(dim=-1)
            depth_decoder_input_ids = labels[train_mask][..., : self.config.num_codebooks - 1]
            # add a placeholder in position 0 that will be replaced by the backbone last hidden state
            depth_decoder_input_ids = nn.functional.pad(depth_decoder_input_ids, (1, 0), value=0)

            train_idxs = train_mask.nonzero(as_tuple=True)
            backbone_last_hidden_states = backbone_hidden_states[train_idxs[0], train_idxs[1] - 1, :]
            depth_decoder_labels = labels[train_mask]

            depth_decoder_outputs = self.depth_decoder(
                input_ids=depth_decoder_input_ids,
                backbone_last_hidden_state=backbone_last_hidden_states,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=True,
                labels=depth_decoder_labels,
            )
            depth_decoder_loss = depth_decoder_outputs.loss
            loss = backbone_loss + depth_decoder_loss

        return CsmOutputWithPast(
            loss=loss,
            backbone_loss=backbone_loss,
            depth_decoder_loss=depth_decoder_loss,
            logits=backbone_logits,
            past_key_values=backbone_outputs.past_key_values,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
            depth_decoder_logits=depth_decoder_outputs.logits if depth_decoder_outputs is not None else None,
            depth_decoder_past_key_values=depth_decoder_outputs.past_key_values
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_hidden_states=depth_decoder_outputs.hidden_states
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_attentions=depth_decoder_outputs.attentions if depth_decoder_outputs is not None else None,
        )

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


__all__ = [
    "CsmPreTrainedModel",
    "CsmBackboneModel",
    "CsmDepthDecoderModel",
    "CsmDepthDecoderForCausalLM",
    "CsmForConditionalGeneration",
]