
import math
from typing import List, Optional, Tuple

import torch
import torch.nn.functional as F
from torch import nn, Tensor

__all__ = [
    "ResBlock",
    "MelResNet",
    "Stretch2d",
    "UpsampleNetwork",
    "WaveRNN",
]


class ResBlock(nn.Module):
    r"""ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.

    Args:
        n_freq: the number of bins in a spectrogram. (Default: ``128``)

    Examples
        >>> resblock = ResBlock()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = resblock(input)  # shape: (10, 128, 512)
    """

    def __init__(self, n_freq: int = 128) -> None:
        super().__init__()

        self.resblock_model = nn.Sequential(
            nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False),
            nn.BatchNorm1d(n_freq),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False),
            nn.BatchNorm1d(n_freq),
        )

    def forward(self, specgram: Tensor) -> Tensor:
        r"""Pass the input through the ResBlock layer.

        Args:
            specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_freq, n_time)
        """
        return self.resblock_model(specgram) + specgram


class MelResNet(nn.Module):
    r"""MelResNet layer uses a stack of ResBlocks on spectrogram.

    Args:
        n_res_block: the number of ResBlocks in the stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the size of the kernel in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> melresnet = MelResNet()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = melresnet(input)  # shape: (10, 128, 508)
    """

    def __init__(
        self,
        n_res_block: int = 10,
        n_freq: int = 128,
        n_hidden: int = 128,
        n_output: int = 128,
        kernel_size: int = 5,
    ) -> None:
        super().__init__()

        ResBlocks = [ResBlock(n_hidden) for _ in range(n_res_block)]

        self.melresnet_model = nn.Sequential(
            nn.Conv1d(in_channels=n_freq, out_channels=n_hidden, kernel_size=kernel_size, bias=False),
            nn.BatchNorm1d(n_hidden),
            nn.ReLU(inplace=True),
            *ResBlocks,
            nn.Conv1d(in_channels=n_hidden, out_channels=n_output, kernel_size=1),
        )

    def forward(self, specgram: Tensor) -> Tensor:
        r"""Pass the input through the MelResNet layer.

        Args:
            specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
        """
        return self.melresnet_model(specgram)


class Stretch2d(nn.Module):
    r"""Upscale the frequency and time dimensions of a spectrogram.

    Args:
        time_scale: the scale factor in time dimension
        freq_scale: the scale factor in frequency dimension

    Examples
        >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5)

        >>> input = torch.rand(10, 100, 512)  # a random spectrogram
        >>> output = stretch2d(input)  # shape: (10, 500, 5120)
    """

    def __init__(self, time_scale: int, freq_scale: int) -> None:
        super().__init__()

        self.freq_scale = freq_scale
        self.time_scale = time_scale

    def forward(self, specgram: Tensor) -> Tensor:
        r"""Pass the input through the Stretch2d layer.

        Args:
            specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time).

        Return:
            Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
        """
        return specgram.repeat_interleave(self.freq_scale, -2).repeat_interleave(self.time_scale, -1)


class UpsampleNetwork(nn.Module):
    r"""Upscale the dimensions of a spectrogram.

    Args:
        upsample_scales: the list of upsample scales.
        n_res_block: the number of ResBlocks in the stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the size of the kernel in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16])
        >>> input = torch.rand(10, 128, 10)  # a random spectrogram
        >>> output = upsamplenetwork(input)  # shape: (10, 128, 1536), (10, 128, 1536)
    """

    def __init__(
        self,
        upsample_scales: List[int],
        n_res_block: int = 10,
        n_freq: int = 128,
        n_hidden: int = 128,
        n_output: int = 128,
        kernel_size: int = 5,
    ) -> None:
        super().__init__()

        total_scale = 1
        for upsample_scale in upsample_scales:
            total_scale *= upsample_scale
        self.total_scale: int = total_scale

        self.indent = (kernel_size - 1) // 2 * total_scale
        self.resnet = MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size)
        self.resnet_stretch = Stretch2d(total_scale, 1)

        up_layers = []
        for scale in upsample_scales:
            # Each stage stretches the time axis by ``scale`` and smooths the result with
            # an averaging convolution whose weights are initialized to 1 / (scale * 2 + 1).
            stretch = Stretch2d(scale, 1)
            conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(1, scale * 2 + 1), padding=(0, scale), bias=False)
            torch.nn.init.constant_(conv.weight, 1.0 / (scale * 2 + 1))
            up_layers.append(stretch)
            up_layers.append(conv)
        self.upsample_layers = nn.Sequential(*up_layers)

    def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]:
        r"""Pass the input through the UpsampleNetwork layer.

        Args:
            specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time)

        Return:
            Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
                          (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
            where total_scale is the product of all elements in upsample_scales.
        """
        resnet_output = self.resnet(specgram).unsqueeze(1)
        resnet_output = self.resnet_stretch(resnet_output)
        resnet_output = resnet_output.squeeze(1)

        specgram = specgram.unsqueeze(1)
        upsampling_output = self.upsample_layers(specgram)
        upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent : -self.indent]

        return upsampling_output, resnet_output


class WaveRNN(nn.Module):
    r"""WaveRNN model from *Efficient Neural Audio Synthesis* :cite:`wavernn`
    based on the implementation from `fatchord/WaveRNN <https://github.com/fatchord/WaveRNN>`_.

    The original implementation was introduced in *Efficient Neural Audio Synthesis*
    :cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
    The product of `upsample_scales` must equal `hop_length`.

    See Also:
        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wavernn>`__
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        upsample_scales: the list of upsample scales.
        n_classes: the number of output classes.
        hop_length: the number of samples between the starts of consecutive frames.
        n_res_block: the number of ResBlocks in the stack. (Default: ``10``)
        n_rnn: the dimension of RNN layer. (Default: ``512``)
        n_fc: the dimension of fully connected layer. (Default: ``512``)
        kernel_size: the size of the kernel in the first Conv1d layer. (Default: ``5``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)

    Example
        >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
        >>> waveform, sample_rate = torchaudio.load(file)
        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
        >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
        >>> output = wavernn(waveform, specgram)
        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
    """

    def __init__(
        self,
        upsample_scales: List[int],
        n_classes: int,
        hop_length: int,
        n_res_block: int = 10,
        n_rnn: int = 512,
        n_fc: int = 512,
        kernel_size: int = 5,
        n_freq: int = 128,
        n_hidden: int = 128,
        n_output: int = 128,
    ) -> None:
        super().__init__()

        self.kernel_size = kernel_size
        self._pad = (kernel_size - 1 if kernel_size % 2 else kernel_size) // 2
        self.n_rnn = n_rnn
        self.n_aux = n_output // 4
        self.hop_length = hop_length
        self.n_classes = n_classes
        self.n_bits: int = int(math.log2(self.n_classes))

        total_scale = 1
        for upsample_scale in upsample_scales:
            total_scale *= upsample_scale
        if total_scale != self.hop_length:
            raise ValueError(f"Expected: total_scale == hop_length, but found {total_scale} != {hop_length}")

        self.upsample = UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size)
        self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn)

        self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True)
        self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True)

        self.relu1 = nn.ReLU(inplace=True)
        self.relu2 = nn.ReLU(inplace=True)

        self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc)
        self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc)
        self.fc3 = nn.Linear(n_fc, self.n_classes)

    def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
        r"""Pass the input through the WaveRNN model.

        Args:
            waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
            specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time)

        Return:
            Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes)
        """
        if waveform.size(1) != 1:
            raise ValueError("Require the input channel of waveform is 1")
        if specgram.size(1) != 1:
            raise ValueError("Require the input channel of specgram is 1")
        # remove channel dimension until the end
        waveform, specgram = waveform.squeeze(1), specgram.squeeze(1)

        batch_size = waveform.size(0)
        h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
        h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
        # output of upsample:
        # specgram: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale)
        # aux: (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
        specgram, aux = self.upsample(specgram)
        specgram = specgram.transpose(1, 2)
        aux = aux.transpose(1, 2)

        # split the auxiliary features into four equal conditioning streams
        aux_idx = [self.n_aux * i for i in range(5)]
        a1 = aux[:, :, aux_idx[0] : aux_idx[1]]
        a2 = aux[:, :, aux_idx[1] : aux_idx[2]]
        a3 = aux[:, :, aux_idx[2] : aux_idx[3]]
        a4 = aux[:, :, aux_idx[3] : aux_idx[4]]

        x = torch.cat([waveform.unsqueeze(-1), specgram, a1], dim=-1)
        x = self.fc(x)
        res = x
        x, _ = self.rnn1(x, h1)

        x = x + res
        res = x
        x = torch.cat([x, a2], dim=-1)
        x, _ = self.rnn2(x, h2)

        x = x + res
        x = torch.cat([x, a3], dim=-1)
        x = self.fc1(x)
        x = self.relu1(x)

        x = torch.cat([x, a4], dim=-1)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)

        # bring back channel dimension
        return x.unsqueeze(1)

    @torch.jit.export
    def infer(self, specgram: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
        r"""Inference method of WaveRNN.

        This function currently only supports multinomial sampling, which assumes the
        network is trained on cross entropy loss.

        Args:
            specgram (Tensor):
                Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
                When the ``specgram`` contains spectrograms with different durations,
                by providing ``lengths`` argument, the model will compute
                the corresponding valid output lengths.
                If ``None``, it is assumed that all the audio in ``specgram``
                have valid length. Default: ``None``.

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor
                The inferred waveform of size `(n_batch, 1, n_time)`.
                1 stands for a single channel.
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
                is returned.
                It indicates the valid length in time axis of the output Tensor.
        """
        device = specgram.device
        dtype = specgram.dtype

        specgram = torch.nn.functional.pad(specgram, (self._pad, self._pad))
        specgram, aux = self.upsample(specgram)
        if lengths is not None:
            lengths = lengths * self.upsample.total_scale

        output: List[Tensor] = []
        b_size, _, seq_len = specgram.size()

        h1 = torch.zeros((1, b_size, self.n_rnn), device=device, dtype=dtype)
        h2 = torch.zeros((1, b_size, self.n_rnn), device=device, dtype=dtype)
        x = torch.zeros((b_size, 1), device=device, dtype=dtype)

        aux_split = [aux[:, self.n_aux * i : self.n_aux * (i + 1), :] for i in range(4)]

        # generate one sample at a time, feeding the previous sample back in
        for i in range(seq_len):
            m_t = specgram[:, :, i]

            a1_t, a2_t, a3_t, a4_t = [a[:, :, i] for a in aux_split]

            x = torch.cat([x, m_t, a1_t], dim=1)
            x = self.fc(x)
            _, h1 = self.rnn1(x.unsqueeze(1), h1)

            x = x + h1[0]
            inp = torch.cat([x, a2_t], dim=1)
            _, h2 = self.rnn2(inp.unsqueeze(1), h2)

            x = x + h2[0]
            x = torch.cat([x, a3_t], dim=1)
            x = F.relu(self.fc1(x))

            x = torch.cat([x, a4_t], dim=1)
            x = F.relu(self.fc2(x))

            logits = self.fc3(x)

            posterior = F.softmax(logits, dim=1)

            x = torch.multinomial(posterior, 1).float()
            # Transform label [0, 2 ** n_bits - 1] to waveform [-1, 1]
            x = 2 * x / (2**self.n_bits - 1.0) - 1.0

            output.append(x)

        return torch.stack(output).permute(1, 2, 0), lengths
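

# ---------------------------------------------------------------------------
# Illustrative smoke test (not part of the torchaudio module): a minimal sketch
# exercising the classes above with the shapes documented in their docstrings.
# The concrete sizes below (batch of 2, 8 spectrogram frames, hop_length of 200)
# are arbitrary assumptions chosen only for demonstration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(0)

    n_batch, n_freq, n_time, hop_length, kernel_size = 2, 128, 8, 200, 5

    # The upsample scales multiply to 200, matching hop_length as WaveRNN requires.
    model = WaveRNN(upsample_scales=[5, 5, 8], n_classes=512, hop_length=hop_length, kernel_size=kernel_size)

    specgram = torch.rand(n_batch, 1, n_freq, n_time)
    waveform = torch.rand(n_batch, 1, (n_time - kernel_size + 1) * hop_length)

    # Teacher-forced forward pass: class scores for every output sample.
    scores = model(waveform, specgram)
    print(scores.shape)  # torch.Size([2, 1, 800, 512])

    # Autoregressive generation from the spectrogram alone (one GRU step per sample, so slow).
    model.eval()
    with torch.no_grad():
        generated, lengths = model.infer(specgram.squeeze(1))
    print(generated.shape)  # torch.Size([2, 1, 1600])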