import os
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

import torch
from torch import nn

from ...generation import GenerateDecoderOnlyOutput, GenerationConfig, GenerationMixin, GenerationMode
from ...generation.logits_process import LogitsProcessorList
from ...generation.stopping_criteria import MaxLengthCriteria, StoppingCriteriaList
from ...generation.utils import GenerateNonBeamOutput
from ...utils import logging


if TYPE_CHECKING:
    from ...generation.streamers import BaseStreamer

logger = logging.get_logger(__name__)


@dataclass
class CsmGenerateOutput(GenerateDecoderOnlyOutput):
    r"""
Outputs of CsmForConditionalGeneration.generate.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True`):
        Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True`):
        Returns the model cache, used to speed up decoding. Different models have a different cache format; check
        the model's documentation for details.
    audio (`list(torch.FloatTensor)` of length `batch_size`):
        The generated audio.
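
Example (a minimal sketch; assumes `model` is a loaded `CsmForConditionalGeneration` and `inputs` was produced by
its processor):

```python
>>> out = model.generate(**inputs, output_audio=True, return_dict_in_generate=True)
>>> out.sequences  # the generated codebook frames
>>> len(out.audio)  # one decoded waveform per batch entry
```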
Naudio )__name__
__module____qualname____firstlineno____doc__r   r   r   torchTensor__annotations____static_attributes__r       ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/csm/generation_csm.pyr   r   *   s!    6 +/E8D&'.r#   r   c                     ^  \ rS rSrS\4U 4S jjr SS\\   S\\   S\	S\
\\	4   4U 4S jjjrS\R                  S	\S
\S\S\S\S   S\\\R                  4   4S jr         SS\\R$                     S\\R$                     S\\R$                     S\\   S	\\   S
\\   S\\   S\S   S\\   S\\\R                  4   4U 4S jjjrSrU =r$ )CsmGenerationMixinJ   returnc                    > [         TU ]  " U0 UD6n[        5       nU HX  n[        U[        5      (       d/  [
        R                  SUR                  R                   S35        MG  UR                  U5        MZ     U$ )NzCsm does not support z' stopping criteria, it will be ignored.)
super_get_stopping_criteriar   
isinstancer   loggerwarning	__class__r   append)selfargskwargscriteriakept_criteria	criterionr/   s         r$   r+   )CsmGenerationMixin._get_stopping_criteriaK   sw    
 714B6B,.!Ii):;;+I,?,?,H,H+IIpq $$Y/ " r#   generation_configuse_model_defaultsr3   c           	      |  >^ UR                  5        VVs0 s H,  u  pEUR                  S5      (       d  M  U[        S5      S U_M.     nnnUR                  5        VVs0 s H  u  pEUR                  S5      (       a  M  XE_M!     nnn[        TU ]  " X40 UD6u  pU R
                  R                  R                  " S0 UD6  [        U R
                  R                  S5      =(       d    U R                  R                  S-
  n[        U R
                  R                  S5      =(       d    U R                  R                  S-
  n	X1U R                  R                  S-
  1:w  a,  [        SU SU	 SU R                  R                  S-
   S	35      eU R
                  R                  R                  (       a0  [        R                  S
5        SU R
                  R                  l        XR
                  R                  l        XR
                  R                  l        UR"                  mSU4S jjn
Xl        X4$ s  snnf s  snnf )z
        This method overrides [`~generation.utils.GenerationMixin._prepare_generation_config`].
        It ensures that the depth decoder generation config is initialized and that arguments passed with the
        `depth_decoder_` prefix are routed to it.
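
        For example (illustrative values; any depth decoder generation attribute can be prefixed the same way):

        ```python
        >>> model.generate(**inputs, depth_decoder_do_sample=True, depth_decoder_temperature=0.8)
        ```

        Here `do_sample=True` and `temperature=0.8` are applied to `self.depth_decoder.generation_config`.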
        """
        depth_decoder_kwargs = {
            k[len("depth_decoder_") :]: v for k, v in kwargs.items() if k.startswith("depth_decoder_")
        }
        kwargs = {k: v for k, v in kwargs.items() if not k.startswith("depth_decoder_")}

        generation_config, model_kwargs = super()._prepare_generation_config(
            generation_config, use_model_defaults, **kwargs
        )

        # initialize the depth decoder generation config from the depth_decoder_* kwargs
        self.depth_decoder.generation_config.update(**depth_decoder_kwargs)

        # the depth decoder must generate exactly num_codebooks - 1 tokens per frame
        depth_decoder_min_new_tokens = (
            getattr(self.depth_decoder.generation_config, "min_new_tokens") or self.config.num_codebooks - 1
        )
        depth_decoder_max_new_tokens = (
            getattr(self.depth_decoder.generation_config, "max_new_tokens") or self.config.num_codebooks - 1
        )
        if {depth_decoder_min_new_tokens, depth_decoder_max_new_tokens} != {self.config.num_codebooks - 1}:
            raise ValueError(
                f"depth_decoder_generation_config's min_new_tokens ({depth_decoder_min_new_tokens}) and "
                f"max_new_tokens ({depth_decoder_max_new_tokens}) must be equal to "
                f"self.config.num_codebooks - 1 ({self.config.num_codebooks - 1})"
            )

        if self.depth_decoder.generation_config.return_dict_in_generate:
            logger.warning(
                "depth_decoder_generation_config.return_dict_in_generate is set to True, but this will be ignored "
                "as the depth decoder model does not return a dictionary in generate"
            )
            self.depth_decoder.generation_config.return_dict_in_generate = False

        self.depth_decoder.generation_config.min_new_tokens = depth_decoder_min_new_tokens
        self.depth_decoder.generation_config.max_new_tokens = depth_decoder_max_new_tokens

        # Csm only supports greedy or sampling generation
        original_get_generation_mode = generation_config.get_generation_mode

        def patched_get_generation_mode(assistant_model=None):
            generation_mode = original_get_generation_mode(assistant_model)
            if generation_mode not in (GenerationMode.GREEDY_SEARCH, GenerationMode.SAMPLE):
                raise ValueError(
                    f"Generation mode {generation_mode} is not supported for CSM model. Please set generation "
                    "parameters to use greedy or sampling generation."
                )
            return generation_mode

        generation_config.get_generation_mode = patched_get_generation_mode

        return generation_config, model_kwargs

    def _sample(
        self,
        input_ids: torch.LongTensor,
        logits_processor: LogitsProcessorList,
        stopping_criteria: StoppingCriteriaList,
        generation_config: GenerationConfig,
        synced_gpus: bool,
        streamer: Optional["BaseStreamer"],
        **model_kwargs,
    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
        r"""
This method overrides [`~generation.utils.GenerationMixin._sample`].
To ease maintenance, modifications are marked with the comment "Csm specific".

Indeed, the Csm model requires a custom generation sampling step:
1. Infer the backbone model to sample the first codebook token
2. Call generate on the depth decoder with the first codebook token as input_ids to sample the next codebook tokens
3. Use these generated codebook tokens as input_ids to sample the next first codebook token using the backbone model
4. Repeat until a stopping criterion is met

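Schematically, one iteration of the loop below moves data as follows (a comment-only sketch; `B` is the batch
size and `K = config.num_codebooks`):

```python
>>> # backbone forward pass   -> samples the first codebook token, shape (B, 1)
>>> # depth decoder generate  -> completes the codebook frame,     shape (B, K)
>>> # append frame            -> running sequence of frames,       shape (B, num_frames, K)
```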
Csm supports two stopping criteria:
- stop when the generated sequence is at max_length
- stop when all the generated codebook tokens are the codebook_eos_token_id
        """
        # init values
        pad_token_id = self.config.codebook_pad_token_id
        has_eos_stopping_criteria = generation_config._eos_token_tensor is not None
        output_attentions = generation_config.output_attentions
        output_hidden_states = generation_config.output_hidden_states
        output_scores = generation_config.output_scores
        output_logits = generation_config.output_logits
        return_dict_in_generate = generation_config.return_dict_in_generate
        do_sample = generation_config.do_sample

        # init attention / hidden states / scores tuples
        scores = () if (return_dict_in_generate and output_scores) else None
        raw_logits = () if (return_dict_in_generate and output_logits) else None
        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

        # keep track of which sequences are already finished
        batch_size, cur_len = input_ids.shape[:2]
        this_peer_finished = False
        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)

        # Csm specific: after the first step, input_ids are replaced by the generated codebook frames,
        # so a max_length criterion built from the text prompt length must be offset by that length
        if input_ids.ndim == 2 and model_kwargs.get("inputs_embeds") is None:
            for criteria in stopping_criteria:
                if isinstance(criteria, MaxLengthCriteria):
                    criteria.max_length -= cur_len

        is_prefill = True
        model_forward = self.__call__
        compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
        if compile_forward:
            os.environ["TOKENIZERS_PARALLELISM"] = "0"
            model_forward = self.get_compiled_call(generation_config.compile_config)

        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
            # prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

            # prepare variable output controls (note: some models won't accept all output controls)
            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
            # Csm specific: the depth decoder needs the backbone's last hidden state
            model_inputs.update({"output_hidden_states": True})

            if is_prefill:
                outputs = self(**model_inputs, return_dict=True)
                is_prefill = False
            else:
                outputs = model_forward(**model_inputs, return_dict=True)

            model_kwargs = self._update_model_kwargs_for_generation(outputs, model_kwargs)
            if synced_gpus and this_peer_finished:
                continue

            next_token_logits = outputs.logits[:, -1, :].clone().float()
            next_token_logits = next_token_logits.to(input_ids.device)

            # pre-process distribution
            next_token_scores = logits_processor(input_ids, next_token_logits)

            # store scores, attentions and hidden_states when required
            if return_dict_in_generate:
                if output_scores:
                    scores += (next_token_scores,)
                if output_logits:
                    raw_logits += (next_token_logits,)
                if output_attentions:
                    decoder_attentions += (outputs.attentions,)
                if output_hidden_states:
                    decoder_hidden_states += (outputs.hidden_states,)

            # token selection: sample or take the argmax for the first codebook token
            if do_sample:
                probs = nn.functional.softmax(next_token_scores, dim=-1)
                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
            else:
                next_tokens = torch.argmax(next_token_scores, dim=-1)

            # ============== Csm specific ==============
            # infer the depth decoder to complete the codebook frame from the first codebook token
            first_codebook_ids = next_tokens[:, None]
            depth_decoder_input_ids = nn.functional.pad(first_codebook_ids, (1, 0), value=0)
            backbone_last_hidden_state = outputs.hidden_states[-1][:, -1, :]
            depth_decoder_outputs = self.depth_decoder.generate(
                input_ids=depth_decoder_input_ids,
                backbone_last_hidden_state=backbone_last_hidden_state,
            )
            codebook_ids = (
                depth_decoder_outputs
                if isinstance(depth_decoder_outputs, torch.Tensor)
                else depth_decoder_outputs.sequences
            )
            # remove the placeholder token prepended for the depth decoder
            next_tokens = codebook_ids[:, 1:]
            # ==========================================

            # finished sequences should have their next token be a padding token
            if has_eos_stopping_criteria:
                next_tokens = next_tokens * unfinished_sequences.unsqueeze(-1) + pad_token_id * (
                    1 - unfinished_sequences.unsqueeze(-1)
                )

            # update generated ids, model inputs, and length for next step
            if input_ids.ndim == 2:
                # first step on a text prompt: input_ids become the generated codebook frames
                input_ids = next_tokens[:, None, :]
            else:
                input_ids = torch.cat([input_ids, next_tokens[:, None, :]], dim=1)
            if streamer is not None:
                streamer.put(next_tokens.cpu())

            unfinished_sequences = unfinished_sequences & ~(
                input_ids[:, -1, :] == self.config.codebook_eos_token_id
            ).all(-1)
            unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
            this_peer_finished = unfinished_sequences.max() == 0
            cur_len += 1

            del outputs
            del depth_decoder_outputs

        if streamer is not None:
            streamer.end()

        if return_dict_in_generate:
            return GenerateDecoderOnlyOutput(
                sequences=input_ids,
                scores=scores,
                logits=raw_logits,
                attentions=decoder_attentions,
                hidden_states=decoder_hidden_states,
                past_key_values=model_kwargs.get("past_key_values"),
            )
        else:
            return input_ids

    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        synced_gpus: Optional[bool] = None,
        streamer: Optional["BaseStreamer"] = None,
        output_audio: Optional[bool] = False,
        **kwargs,
    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
        r"""
This method overrides [`~generation.utils.GenerationMixin.generate`] to match the specifics of the Csm model.
Indeed, the Csm model requires a custom generation sampling step:
1. Infer the backbone model to sample the first codebook token
2. Call generate on the depth decoder with the first codebook token as `input_ids` to sample the next codebook tokens
3. Use these generated codebook tokens as `input_ids` to sample the next first codebook token using the backbone model
4. Repeat until a stopping criterion is met

<Tip warning={true}>

Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
model's default generation configuration. You can override any `generation_config` by passing the corresponding
parameters to generate(), e.g. `.generate(inputs, do_sample=True)`.
</Tip>

Parameters:
    input_ids (`torch.Tensor` of shape `(batch_size, seq_length)`, *optional*):
        The sequence used as a prompt for the backbone model.
    input_values (`torch.Tensor` of shape (batch_size, channels, max_concatenated_audio_length), *optional*):
        The batched audio input values, where each batch entry contains the concatenation of all audio segments for that entry.
        These values will be encoded into codebook tokens using the codec model and merged with the text input ids provided in `input_ids`.
    input_values_cutoffs (`torch.Tensor` of shape (batch_size, max_num_audio), *optional*):
        Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
        If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
        where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
        the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
    generation_config ([`~generation.GenerationConfig`], *optional*):
        The generation configuration to be used as base parametrization for the generation call. `**kwargs`
        passed to generate matching the attributes of `generation_config` will override them. If
        `generation_config` is not provided, the default will be used, which has the following loading
        priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
        configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
        default values, whose documentation should be checked to parameterize generation.
    logits_processor (`LogitsProcessorList`, *optional*):
        Custom logits processors that complement the default logits processors built from arguments and
        generation config. If a logit processor is passed that is already created with the arguments or a
        generation config an error is thrown. This feature is intended for advanced users.
    stopping_criteria (`StoppingCriteriaList`, *optional*):
        Custom stopping criteria that complements the default stopping criteria built from arguments and a
        generation config. If a stopping criteria is passed that is already created with the arguments or a
        generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
        sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
        intended for advanced users.
    synced_gpus (`bool`, *optional*):
        Whether to continue running the while loop until max_length. Unless overridden, this flag will be set
        to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid
        deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`.
    streamer (`BaseStreamer`, *optional*):
        Streamer object that will be used to stream the generated sequences. Generated tokens are passed
        through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
    output_audio (`bool`, *optional*):
        Whether to return the generated audio.
    kwargs (`Dict[str, Any]`, *optional*):
        Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
        forwarded to the `forward` function of the model. Depth decoder specific kwargs should be prefixed with *depth_decoder_*.

Return:
    [`CsmGenerateOutput`] or `torch.LongTensor` or `List[torch.FloatTensor]`: A [`CsmGenerateOutput`]
    (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.LongTensor` when `output_audio=False`
    or a `List[torch.FloatTensor]` otherwise.

Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, CsmForConditionalGeneration
>>> from datasets import load_dataset, Audio

>>> model_id = "eustlb/csm-1b"
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

>>> processor = AutoProcessor.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
>>> # ensure the audio is 24kHz
>>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

>>> conversation = []
>>> # prepare a conversation with text and corresponding audio
>>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
...     conversation.append(
...         {
...             "role": f"{speaker_id}",
...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
...         }
...     )

>>> # text prompt
>>> conversation.append({"role": f"{ds[4]['speaker_id']}", "content": [{"type": "text", "text": ds[4]["text"]}]})

>>> inputs = processor.apply_chat_template(
...     conversation,
...     tokenize=True,
...     return_dict=True,
... ).to(torch_device)

>>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
>>> audio = model.generate(**inputs, output_audio=True)
>>> processor.save_audio(audio, "output.wav")
        ```
        """
        generate_output = super().generate(
            input_ids=input_ids,
            input_values=input_values,
            input_values_cutoffs=input_values_cutoffs,
            generation_config=generation_config,
            logits_processor=logits_processor,
            stopping_criteria=stopping_criteria,
            synced_gpus=synced_gpus,
            streamer=streamer,
            **kwargs,
        )

        generate_returned_dict = not isinstance(generate_output, torch.Tensor)

        audio = None
        if output_audio:
            generated_audio_codes = generate_output.sequences if generate_returned_dict else generate_output

            audio = []
            with torch.no_grad():
                for audio_codes_batch in generated_audio_codes:
                    # truncate the frames at the first all-eos codebook frame, if any
                    eos_idxs = (audio_codes_batch == self.config.codebook_eos_token_id).all(dim=-1).nonzero()
                    if eos_idxs.numel() != 0:
                        cutoff_idx = eos_idxs.min()
                    else:
                        cutoff_idx = audio_codes_batch.shape[0]
                    audio_codes_batch = audio_codes_batch[:cutoff_idx]

                    # decode the codebook frames into a waveform with the codec model
                    codec_decode_output = self.codec_model.decode(audio_codes_batch.transpose(0, 1).unsqueeze(0))
                    audio.append(codec_decode_output.audio_values[0])

        if generate_returned_dict:
            return CsmGenerateOutput(audio=audio, **generate_output)
        elif output_audio:
            return audio
        else:
            return generate_output