
    fThn                     ~   S r SSKrSSKJr  SSKJr  SSKrSSKrSSK	J
r
  SSKJ
s  Jr  SSKJr  SSKJrJr  SS	KJr  \ " S
 S\5      5       r\ " S S\5      5       r\ " S S\5      5       r " S S\
R0                  5      r " S S\
R0                  5      r " S S\
R0                  5      r " S S\
R0                  5      r " S S\
R0                  5      r " S S\
R0                  5      r " S S\
R0                  5      r " S S\
R0                  5      r \ " S  S!\5      5       r!\" S"S#9 " S$ S%\!5      5       r"S%S!/r#g)&zTransformers DAC model.    N)	dataclass)Optional   )PreTrainedModel)ModelOutputauto_docstring   )	DacConfigc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   S	rg)
	DacOutput   a  
Args:
    loss (`torch.Tensor`):
        Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
    audio_values (`torch.Tensor` of shape `(batch_size, input_length)`):
        Reconstructed audio data.
    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
        Quantized continuous representation of input.
    audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`):
        Codebook indices for each codebook (quantized discrete representation of input).
    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
        Projected latents (continuous representation of input before quantization).
Nlossaudio_valuesquantized_representationaudio_codesprojected_latents )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   
LongTensorr   __static_attributes__r       \/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/dac/modeling_dac.pyr   r      st     )-D(5$$
%,04L(5,,-4<@hu'8'89@.2K%**+259x 1 129r   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Srg)	DacEncoderOutput6   a  
Args:
    loss (`torch.Tensor`):
        Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
        Quantized continuous representation of input.
    audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
        Codebook indices for each codebook (quantized discrete representation of input).
    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*):
        Projected latents (continuous representation of input before quantization).
Nr   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r!   r!   6   s_    
 )-D(5$$
%,<@hu'8'89@/3K%++,359x 1 129r   r!   c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)DacDecoderOutputJ   z
Args:
    audio_values (`torch.FloatTensor`  of shape `(batch_size, input_length)`, *optional*):
        Decoded audio values, obtained using the decoder part of Dac.
Nr   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r$   r$   J   s     15L(5,,-4r   r$   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )Snake1dV   z3
A 1-dimensional Snake activation function module.
c                    > [         TU ]  5         [        R                  " [        R
                  " SUS5      5      U l        g )Nr	   )super__init__nn	Parameterr   onesalpha)self
hidden_dim	__class__s     r   r+   Snake1d.__init__[   s+    \\%**Q
A">?
r   c                    UR                   nUR                  US   US   S5      nXR                  S-   R                  5       [        R
                  " U R                  U-  5      R                  S5      -  -   nUR                  U5      nU$ )Nr   r	   g&.>   )shapereshaper/   
reciprocalr   sinpow)r0   hidden_statesr7   s      r   forwardSnake1d.forward_   s    ##%--eAha"E%d):(F(F(H599UYU_U_boUoKpKtKtuvKw(ww%--e4r   )r/   )	r   r   r   r   r   r+   r=   r   __classcell__r2   s   @r   r'   r'   V   s    @ r   r'   c                   @   ^  \ rS rSrSrS\4U 4S jjrS rS rSr	U =r
$ )DacVectorQuantizeg   a  
Implementation of VQ similar to Karpathy's repo (https://github.com/karpathy/deep-vector-quantization)

Additionally uses following tricks from improved VQGAN
(https://arxiv.org/pdf/2110.04627.pdf):
    1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
        for improved codebook usage
    2. l2-normalized codes: Converts euclidean distance to cosine similarity which
        improves training stability
configc                 >  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  UR
                  5      U l
        g )Nr	   kernel_size)r*   r+   r,   Conv1dhidden_sizecodebook_dimin_projout_proj	Embeddingcodebook_sizecodebookr0   rD   r2   s     r   r+   DacVectorQuantize.__init__s   sn    yy!3!3V5H5HVWX		&"5"5v7I7IWXYV%9%96;N;NOr   c                 ,   U R                  U5      nU R                  U5      u  p4[        R                  " X#R	                  5       SS9n[        R                  " X2R	                  5       SS9nX#U-
  R	                  5       -   nU R                  U5      nX5XdU4$ )a  
Quantizes the input tensor using a fixed codebook and returns the corresponding codebook vectors.

Args:
    hidden_state (`torch.FloatTensor` of shape `(batch_size, dimension, time_steps)`):
        Input tensor.

Returns:
    quantized_representation (`torch.Tensor`of shape `(batch_size, dimension, time_steps)`):
        Quantized continuous representation of input.
    commitment_loss (`torch.FloatTensor`of shape `(1)`):
        Commitment loss to train encoder to predict vectors closer to codebook entries.
    codebook_loss (`torch.FloatTensor`of shape `(1)`):
        Codebook loss to update the codebook.
    audio_codes (`torch.LongTensor` of shape `(batch_size, time_steps)`):
        Codebook indices for each codebook, quantized discrete representation of input.
    projected_latents (torch.FloatTensor of shape `(batch_size, num_codebooks * dimension, time_steps)`):
        Projected latents (continuous representation of input before quantization).
mean)	reduction)rK   decode_latentsFmse_lossdetachrL   )r0   hidden_stater   r   r   commitment_losscodebook_losss          r   r=   DacVectorQuantize.forwardz   s    * !LL6040C0CDU0V- **%68W8W8Yekl

#;=U=U=Wcij#4Sd8d7l7l7n#n #'==1I#J '-Vgggr   c                 n   UR                   u  p#nUR                  SSS5      R                  X$-  U5      nU R                  R                  n[
        R                  " U5      n[
        R                  " U5      nUR                  S5      R                  SSS9nUSU-  UR                  5       -  -
  * UR                  S5      R                  SSS9R                  5       -   nUR                  S5      S   n	U	R                  UR                  S5      S5      n	U R                  U	5      R                  SS5      n
X4$ )Nr   r6   r	   T)keepdimr5   )r7   permuter8   rO   weightrV   	normalizer;   sumtmaxsize	transpose)r0   r<   
batch_sizer1   sequence_length	encodingsrO   l2_normdistindicesr   s              r   rU    DacVectorQuantize.decode_latents   s   2?2E2E/
!))!Q2:::;WYcd	=='' KK	*	;;x( --"&&q$&71y=8::<7788<<?;N;NqZ^;N;_;a;a;cc((1+a.//-"4"4Q"7<#'==#9#C#CAq#I '00r   )rO   rK   rL   )r   r   r   r   r   r
   r+   r=   rU   r   r?   r@   s   @r   rB   rB   g   s'    	Py Ph@1 1r   rB   c                   B   ^  \ rS rSrSrSS\S\4U 4S jjjrS rSrU =r	$ )	DacResidualUnit   zY
A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
	dimensiondilationc                    > [         TU ]  5         SU-  S-  n[        U5      U l        [        R
                  " XSX#S9U l        [        U5      U l        [        R
                  " XSS9U l        g )N   r6      )rG   rr   paddingr	   rF   )	r*   r+   r'   snake1r,   rH   conv1snake2conv2)r0   rq   rr   padr2   s       r   r+   DacResidualUnit.__init__   s[    !a'i(YYyXc
i(YYyC
r   c                     UnU R                  U R                  U5      5      nU R                  U R                  U5      5      nUR                  S   UR                  S   -
  S-  nUS:  a	  USX3* 24   nX-   nU$ )a2  
Forward pass through the residual unit.

Args:
    hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
        Input tensor .

Returns:
    output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
        Input tensor after passing through the residual unit.
r5   r6   r   .)rx   rw   rz   ry   r7   )r0   rY   output_tensorrv   s       r   r=   DacResidualUnit.forward   s     %

4;;}#=>

4;;}#=>%%b)M,?,?,CCIQ;'WX-=(=>L$4r   )rx   rz   rw   ry   )   r	   )
r   r   r   r   r   intr+   r=   r   r?   r@   s   @r   ro   ro      s.    D# Dc D D r   ro   c                   F   ^  \ rS rSrSrS	S\S\S\4U 4S jjjrS rSr	U =r
$ )
DacEncoderBlock   z"Encoder block used in DAC encoder.rD   stridestride_indexc           
      L  > [         TU ]  5         UR                  SU-  -  n[        US-  SS9U l        [        US-  SS9U l        [        US-  SS9U l        [        US-  5      U l        [        R                  " US-  USU-  U[        R                  " US-  5      S9U l        g )Nr6   r	   rr   r   	   rG   r   rv   )r*   r+   encoder_hidden_sizero   	res_unit1	res_unit2	res_unit3r'   rw   r,   rH   mathceilrx   )r0   rD   r   r   rq   r2   s        r   r+   DacEncoderBlock.__init__   s    ..L@	(a!D(a!D(a!Di1n-YYNI1v:fVZV_V_`fij`jVk

r   c                     U R                  U5      nU R                  U5      nU R                  U R                  U5      5      nU R	                  U5      nU$ N)r   r   rw   r   rx   r0   rY   s     r   r=   DacEncoderBlock.forward   sI    ~~l3~~l3{{4>>,#?@zz,/r   )rx   r   r   r   rw   r	   r	   r   r   r   r   r   r
   r   r+   r=   r   r?   r@   s   @r   r   r      s/    ,

y 

# 

 

 

 r   r   c                   F   ^  \ rS rSrSrS	S\S\S\4U 4S jjjrS rSr	U =r
$ )
DacDecoderBlock   z"Decoder block used in DAC decoder.rD   r   r   c           
      X  > [         TU ]  5         UR                  SU-  -  nUR                  SUS-   -  -  n[        U5      U l        [
        R                  " UUSU-  U[        R                  " US-  5      S9U l	        [        USS9U l        [        USS9U l        [        USS9U l        g )Nr6   r	   r   r   r   r   )r*   r+   decoder_hidden_sizer'   rw   r,   ConvTranspose1dr   r   conv_t1ro   r   r   r   )r0   rD   r   r   	input_dim
output_dimr2   s         r   r+   DacDecoderBlock.__init__   s    ..!\/A	//19I3JJ
i())F
IIfqj)
 )a@(a@(a@r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )rw   r   r   r   r   r   s     r   r=   DacDecoderBlock.forward   sN    {{<0||L1~~l3~~l3~~l3r   )r   r   r   r   rw   r   r   r@   s   @r   r   r      s4    ,Ay A# A A A$ r   r   c                      ^  \ rS rSrSrS\4U 4S jjrSS\\   4S jjr	S\
R                  4S jrS	\
R                  4S
 jrSrU =r$ )DacResidualVectorQuantizei	  z|
ResidualVectorQuantize block - Introduced in SoundStream: An end2end neural audio codec (https://arxiv.org/abs/2107.03312)
rD   c                   > [         TU ]  5         UR                  nUR                  nX l        [        R
                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        X0l        g s  snf r   )	r*   r+   n_codebooksquantizer_dropoutr,   
ModuleListrangerB   
quantizers)r0   rD   r   r   ir2   s        r   r+   "DacResidualVectorQuantize.__init__  sj    (("44&--ERXRdRdLe(fLeq):6)BLe(fg!2 )gs   A=n_quantizersc                    SnUnSnSn/ n/ nUb  UOU R                   nU R                  (       a  [        R                  " UR                  S   45      U R                   -  S-   n[        R
                  " SU R                   S-   UR                  S   45      n	[        UR                  S   U R                  -  5      n
U	SU
 USU
& UR                  UR                  5      n[        U R                  5       H  u  pU R                  SL a  X:  a    OU" U5      u  pnnn[        R                  " UR                  S   4XR                  S9U:  nX=USS2SS4   -  -   nXM-
  nX^U-  -  nXoU-  -  nUR                  U5        UR                  U5        M     [        R                  " USS9n[        R                  " USS9nX7XU4$ )a  
Quantizes the input tensor using a fixed set of codebooks and returns corresponding codebook vectors.
Args:
    hidden_state (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
        Input tensor to be quantized.
    n_quantizers (`int`, *optional*):
        Number of quantizers to use. If specified and `self.quantizer_dropout` is True,
        this argument is ignored during training, and a random number of quantizers is used.

Returns:
    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
        Quantized continuous representation of input.
    audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
        Codebook indices for each codebook (quantized discrete representation of input).
    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
        Projected latents (continuous representation of input before quantization).
    commitment_loss (`torch.Tensor` of shape `(1)`):
        Commitment loss to train the encoder to predict vectors closer to codebook entries.
    codebook_loss (`torch.Tensor` of shape `(1)`):
        Codebook loss to update the codebook.
r   Nr	   F)
fill_valuedevicedim)r   trainingr   r.   r7   randintr   r   tor   	enumerater   fullappendstackcat)r0   rY   r   r   residualrZ   r[   r   r   dropout	n_dropoutr   	quantizerquantized_representation_icommitment_loss_icodebook_loss_i	indices_iprojected_latents_imasks                      r   r=   !DacResidualVectorQuantize.forward  s   . $% '3'?|TEUEU== ::|'9'9!'<&>?$BRBRRUVVLmmAt'7'7!';l>P>PQR>S=UVGL..q1D4J4JJKI'.z	':L)$'??<+>+>?L%doo6LA}}%!*;mvnj&?IWj
 ::|11!461M`M`adppD'?_cdegkmqdq_rBr'r$<H 477Ot33My)$$%89% 7( kk+15!II&7Q?'6GZgggr   r   c                 N   Sn/ nUR                   S   n[        U5       Hl  nU R                  U   R                  USS2USS24   5      R	                  SS5      nUR                  U5        X R                  U   R                  U5      -  nMn     U[        R                  " USS9U4$ )a.  
Reconstructs the continuous representation from quantized codes.

Args:
    audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
        Quantized discrete representation of input.

Returns:
    quantized_representation (`torch.Tensor`):
        Quantized continuous representation of input.
    projected_latents (`torch.Tensor`):
        List of projected latents (continuous representations of input before quantization)
        for each codebook.
    audio_codes (`torch.Tensor`):
        Codebook indices for each codebook.
g        r	   Nr6   r   )	r7   r   r   rO   rf   r   rL   r   r   )r0   r   r   r   r   r   r   s          r   
from_codes$DacResidualVectorQuantize.from_codesY  s    " $' !''*{#A"&//!"4"="=k!QPQ'>R"S"]"]^_ab"c$$%89$(:(C(CDW(XX$ $ (3D!)LkYYr   latentsc                 t   Sn/ n/ n[         R                  " S/U R                   Vs/ s H  oUR                  PM     sn-   5      n[         R                  " USS9n[
        R                  " XqR                  S   :*  5      S   R                  SSS9S   n[        U5       H|  n	Xy   XyS-      pU R                  U	   R                  USS2X2SS24   5      u  pUR                  U5        UR                  U5        U R                  U	   R                  U5      nX.-   nM~     U[         R                  " USS94$ s  snf )aE  Reconstructs the quantized representation from unquantized latents.

Args:
    latents (`torch.Tensor` of shape `(batch_size, total_latent_dimension, time_steps)`):
        Continuous representation of input after projection.

Returns:
    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
        Quantized representation of the full-projected space.
    quantized_latents (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
        Quantized representation of the latent space (continuous representation before quantization).
r   r   r	   T)axiskeepdimsN)r   tensorr   rJ   cumsumnpwherer7   rd   r   rU   r   rL   r   )r0   r   r   quantized_latentscodesqcodebook_dims_tensordimsr   r   hidden_dim_jhidden_dim_kquantized_latents_icodes_ir   s                  r   from_latents&DacResidualVectorQuantize.from_latentss  s2    $% $||QC4??2[?a>>?2[,[\||0a8hht}}Q'778;??QQU?VWXY{#A)-$1u+,+/??1+=+L+LWUVXdXqstUtMu+v($$%89LL!)-);)D)DEX)Y&'?'\$ $ (3D!)LLL 3\s   D5
)r   r   r   r   )r   r   r   r   r   r
   r+   r   r   r=   r   Tensorr   r   r   r?   r@   s   @r   r   r   	  sQ    	3y 	3>h(3- >h@Zell Z4MELL M Mr   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )
DacDecoderi  zDAC DecoderrD   c                   > [         T	U ]  5         UR                  nUR                  nUR                  n[
        R                  " X#SSS9U l        / n[        U5       H  u  pgU[        XU5      /-  nM     [
        R                  " U5      U l        UR                  SWS-   -  -  n[        U5      U l        [
        R                  " USSSS9U l        [
        R                  " 5       U l        g )Nru   r   rG   rv   r6   r	   )r*   r+   rI   r   upsampling_ratiosr,   rH   rx   r   r   r   blockr'   rw   rz   Tanhtanh)
r0   rD   input_channelchannelsstridesr   r   r   r   r2   s
            r   r+   DacDecoder.__init__  s    **--** YY}AqQ
 $-g$6 LoflCDDE %7 ]]5)
//19I3JJ
j)YYz1!QG
GGI	r   c                     U R                  U5      nU R                   H  nU" U5      nM     U R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )rx   r   rw   rz   r   )r0   rY   layers      r   r=   DacDecoder.forward  sZ    zz,/ZZE .L   {{<0zz,/yy.r   )r   rx   rz   rw   r   
r   r   r   r   r   r
   r+   r=   r   r?   r@   s   @r   r   r     s    y *
 
r   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )
DacEncoderi  zDAC EncoderrD   c           	        > [         TU ]  5         UR                  n[        R                  " SUR
                  SSS9U l        / U l        [        U5       H(  u  p4US-   nU =R                  [        XUS9/-  sl        M*     [        R                  " U R                  5      U l        UR
                  SW-  -  n[        U5      U l        [        R                  " XQR                  SSS9U l        g )Nr	   ru   r   r   )r   r   r6   )r*   r+   downsampling_ratiosr,   rH   r   rx   r   r   r   r   r'   rw   rI   rz   )r0   rD   r   r   r   d_modelr2   s         r   r+   DacEncoder.__init__  s    ,,YYq&"<"<!UVW

$-g$6 L'!+LJJ?6|\]]J %7 ]]4::.
,,q,>g&YYw(:(:STU
r   c                     U R                  U5      nU R                   H  nU" U5      nM     U R                  U5      nU R                  U5      nU$ r   )rx   r   rw   rz   )r0   rY   modules      r   r=   DacEncoder.forward  sL    zz,/jjF!,/L ! {{<0zz,/r   )r   rx   rz   rw   r   r@   s   @r   r   r     s    Vy V$	 	r   r   c                   2    \ rS rSr\rSrSrS rS r	S r
Srg)	DacPreTrainedModeli  dacinput_valuesc                     [        U[        R                  5      (       aS  [        R                  R	                  UR
                  SS9  [        R                  R                  UR                  S5        g g )Ng{Gz?)stdr   )
isinstancer,   rH   inittrunc_normal_r`   	constant_bias)r0   r   s     r   _init_weights DacPreTrainedModel._init_weights  sK    fbii((GG!!&--T!:GGfkk1- )r   c                    [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                  R                   H'  nU" UR                  5        U" UR                  5        M)     U" U R                  R                  5        U" U R                  R                  5        U R                  R                   H  nU" UR                  5        U" UR                  R                  5        U" UR                  R                  5        U" UR                  R                  5        U" UR                  R                  5        U" UR                  R                  5        U" UR                  R                  5        M     U" U R                   R                  5        U" U R                   R                  5        U R                   R                   H  nU" UR"                  5        U" UR                  R                  5        U" UR                  R                  5        U" UR                  R                  5        U" UR                  R                  5        U" UR                  R                  5        U" UR                  R                  5        M     g )Nweight_norm)r,   utilsr  hasattrparametrizationsr   r   rK   rL   encoderrx   rz   r   r   r   r   decoderr   )r0   r  r   s      r   apply_weight_norm$DacPreTrainedModel.apply_weight_norm  s   hh**288,,m<<((33??K^^..E&' / 	DLL&&'DLL&&'\\''E$--.--.--.--.--.--. ( 	DLL&&'DLL&&'\\''E&--.--.--.--.--.--. (r   c                 b   U R                   R                   HU  n[        R                  R	                  UR
                  5        [        R                  R	                  UR                  5        MW     [        R                  R	                  U R                  R                  5        [        R                  R	                  U R                  R                  5        U R                  R                   GH_  n[        R                  R	                  UR                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        GMb     [        R                  R	                  U R                  R                  5        [        R                  R	                  U R                  R                  5        U R                  R                   GH_  n[        R                  R	                  UR                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        [        R                  R	                  UR                  R                  5        GMb     g r   )r   r   r,   r  remove_weight_normrK   rL   r	  rx   rz   r   r   r   r   r
  r   )r0   r   s     r   r  %DacPreTrainedModel.remove_weight_norm  sZ   ^^..EHH''6HH''7 / 	##DLL$6$67
##DLL$6$67\\''EHH''4HH''(=(=>HH''(=(=>HH''(=(=>HH''(=(=>HH''(=(=>HH''(=(=> ( 	##DLL$6$67
##DLL$6$67\\''EHH''6HH''(=(=>HH''(=(=>HH''(=(=>HH''(=(=>HH''(=(=>HH''(=(=> (r   r   N)r   r   r   r   r
   config_classbase_model_prefixmain_input_namer  r  r  r   r   r   r   r   r     s"    L$O.
/B?r   r   z/
    The DAC (Descript Audio Codec) model.
    )custom_introc            	       $  ^  \ rS rSrS\4U 4S jjr\  SS\R                  S\	\
   S\	\   4S jj5       r\   SS\	\R                     S	\	\R                     S\	\   4S
 jj5       r\  SS\R                  S\	\
   S\	\   4S jj5       rSrU =r$ )DacModeli#  rD   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        [        [        R                  " U R                  R                  5      5      U l        SU R                  -  U R                  R                  :w  a  [        S5      eU R                  5         g )Nr6   z'The codebook_size must be a power of 2.)r*   r+   rD   r   r	  r   r
  r   r   r   r   log2rN   bits_per_codebook
ValueError	post_initrP   s     r   r+   DacModel.__init__)  s     !&)!&)26:!$TYYt{{/H/H%I!Jd$$$(A(AAFGG 	r   r   r   return_dictc                    Ub  UOU R                   R                  nU R                  U5      nU R                  XB5      u  pEpgnU R                   R                  U-  U R                   R
                  U-  -   n	U(       d  XXV4$ [        XXV5      $ )z
input_values (`torch.Tensor of shape `(batch_size, 1, time_steps)`):
    Input audio data to encode,
n_quantizers (int, *optional*):
    Number of quantizers to use. If None, all quantizers are used. Default is None.
)rD   r  r	  r   commitment_loss_weightcodebook_loss_weightr!   )
r0   r   r   r  r   r   r   rZ   r[   r   s
             r   encodeDacModel.encode9  s     &1%<k$++BYBY#'<<#= cgcqcq$d
` /@S` {{11OCdkkFfFfivFvvKSS__r   r   r   c                    Uc  Uc  [        S5      eUb  UOU R                  R                  nUb  U R                  R	                  U5      S   nU R                  U5      R                  S5      nU(       d  U4$ [        U5      $ )a  
quantized_representation (torch.Tensor of shape `(batch_size, dimension, time_steps)`, *optional*):
    Quantized continuous representation of input.
audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
    The codebook indices for each codebook, representing the quantized discrete
    representation of the input. This parameter should be provided if you want
    to decode directly from the audio codes (it will overwrite quantized_representation).
zDEither `quantized_representation` or `audio_codes` must be provided.r   r	   )r  rD   r  r   r   r
  squeezer$   )r0   r   r   r  r   s        r   decodeDacModel.decodeT  s      $+0Ccdd%0%<k$++BYBY"'+~~'@'@'Ma'P$||$<=EEaH ?"--r   c                     Ub  UOU R                   R                  nUR                  S   nU R                  XSS9u  pVpxU R	                  USS9S   SSU24   n	U(       d  XYXgU4$ [        XYXgU5      $ )a  
input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`):
    Audio data to encode.
n_quantizers (`int`, *optional*):
    Number of quantizers to use. If `None`, all quantizers are used. Default is `None`.

Examples:

```python
>>> from datasets import load_dataset, Audio
>>> from transformers import DacModel, AutoProcessor
>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

>>> model = DacModel.from_pretrained("descript/dac_16khz")
>>> processor = AutoProcessor.from_pretrained("descript/dac_16khz")
>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
>>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
>>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

>>> encoder_outputs = model.encode(inputs["input_values"])
>>> # Get the intermediate audio codes
>>> audio_codes = encoder_outputs.audio_codes
>>> # Reconstruct the audio from its quantized representation
>>> audio_values = model.decode(encoder_outputs.quantized_representation)
>>> # or the equivalent with a forward pass
>>> audio_values = model(inputs["input_values"]).audio_values
```Nr5   F)r  r   .)rD   r  r7   r   r$  r   )
r0   r   r   r  lengthr   r   r   r   r   s
             r   r=   DacModel.forwards  s    F &1%<k$++BYBY##B'IME JU J
F {{#;{OPQRSVX_Y_X_S_`(@O`aa-ETeffr   )r  rD   r
  r	  r   )NN)NNN)r   r   r   r   r
   r+   r   r   r   r   r   boolr   r$  r=   r   r?   r@   s   @r   r  r  #  s    y    '+&*	`ll` sm` d^	` `4  <@.2&*	."*5<<"8. ell+. d^	. .<  '+&*	,gll,g sm,g d^	,g ,gr   r  )$r   r   dataclassesr   typingr   numpyr   r   torch.nnr,   torch.nn.functional
functionalrV   modeling_utilsr   r  r   r   configuration_dacr
   r   r!   r$   Moduler'   rB   ro   r   r   r   r   r   r   r  __all__r   r   r   <module>r4     s`     !       - 0 ( : : :, :{ : :& 5{ 5 5bii "C1		 C1L"bii "Jbii 0bii >GM		 GMT" "J B F? F? F?R 
xg! xg
xgv +
,r   