
    hZ                     N   S SK r S SKJrJrJrJr  S SKrS SKJrJr  S SK	J
r  S/rS"S\S\S\S	\S
\R                  R                   4
S jjr      S#S\S\S\S\S\\\\\\   4      S\S\S	\S
\R                  R$                  4S jjrS\S
\4S jr " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S\R*                  5      r " S S \R*                  5      r " S! S\R*                  5      rg)$    N)ListOptionalTupleUnion)nnTensor)
functional	Tacotron2in_dimout_dimbiasw_init_gainreturnc                     [         R                  R                  XUS9n[         R                  R                  R	                  UR
                  [         R                  R                  R                  U5      S9  U$ )a  Linear layer with xavier uniform initialization.

Args:
    in_dim (int): Size of each input sample.
    out_dim (int): Size of each output sample.
    bias (bool, optional): If set to ``False``, the layer will not learn an additive bias. (Default: ``True``)
    w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain``
        for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``)

Returns:
    (torch.nn.Linear): The corresponding linear layer.
r   gain)torchr   Linearinitxavier_uniform_weightcalculate_gain)r   r   r   r   linears        S/var/www/auris/envauris/lib/python3.13/site-packages/torchaudio/models/tacotron2.py_get_linear_layerr   )   sR     XX__V4_8F	HHMM!!&--ehhmm6R6RS^6_!`M    in_channelsout_channelskernel_sizestridepaddingdilationc           
      R   Uc'  US-  S:w  a  [        S5      e[        XRS-
  -  S-  5      n[        R                  R	                  U UUUUUUS9n[        R                  R
                  R                  UR                  [        R                  R
                  R                  U5      S9  U$ )a8  1D convolution with xavier uniform initialization.

Args:
    in_channels (int): Number of channels in the input image.
    out_channels (int): Number of channels produced by the convolution.
    kernel_size (int, optional): Number of channels in the input image. (Default: ``1``)
    stride (int, optional): Number of channels in the input image. (Default: ``1``)
    padding (str, int or tuple, optional): Padding added to both sides of the input.
        (Default: dilation * (kernel_size - 1) / 2)
    dilation (int, optional): Number of channels in the input image. (Default: ``1``)
    w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain``
        for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``)

Returns:
    (torch.nn.Conv1d): The corresponding Conv1D layer.
      zkernel_size must be odd)r    r!   r"   r#   r   r   )	
ValueErrorintr   r   Conv1dr   r   r   r   )	r   r   r    r!   r"   r#   r   r   conv1ds	            r   _get_conv1d_layerr+   ;   s    4 ?a677h/2Q67XX__  F 
HHMM!!&--ehhmm6R6RS^6_!`Mr   lengthsc                    [         R                  " U 5      R                  5       n[         R                  " SXR                  U R
                  S9nX R                  S5      :  R                  5       n[         R                  " US5      nU$ )aT  Returns a binary mask based on ``lengths``. The ``i``-th row and ``j``-th column of the mask
is ``1`` if ``j`` is smaller than ``i``-th element of ``lengths.

Args:
    lengths (Tensor): The length of each element in the batch, with shape (n_batch, ).

Returns:
    mask (Tensor): The binary mask, with shape (n_batch, max of ``lengths``).
r   )devicedtyper&   )	r   maxitemaranger.   r/   	unsqueezebytele)r,   max_lenidsmasks       r   _get_mask_from_lengthsr9   i   sf     ii %%'G
,,q'..
NC##A&&,,.D88D!DKr   c                   N   ^  \ rS rSrSrS\S\S\4U 4S jjrS\S\4S	 jrS
r	U =r
$ )_LocationLayerz   a  Location layer used in the Attention model.

Args:
    attention_n_filter (int): Number of filters for attention model.
    attention_kernel_size (int): Kernel size for attention model.
    attention_hidden_dim (int): Dimension of attention hidden representation.
attention_n_filterattention_kernel_sizeattention_hidden_dimc           
         > [         TU ]  5         [        US-
  S-  5      n[        SUUUSSSS9U l        [        XSSS9U l        g )Nr&   r%   F)r    r"   r   r!   r#   tanhr   r   )super__init__r(   r+   location_convr   location_dense)selfr=   r>   r?   r"   	__class__s        r   rD   _LocationLayer.__init__   s]     	,q0A56.-
 05f
r   attention_weights_catr   c                 n    U R                  U5      nUR                  SS5      nU R                  U5      nU$ )aI  Location layer used in the Attention model.

Args:
    attention_weights_cat (Tensor): Cumulative and previous attention weights
        with shape (n_batch, 2, max of ``text_lengths``).

Returns:
    processed_attention (Tensor): Cumulative and previous attention weights
        with shape (n_batch, ``attention_hidden_dim``).
r&   r%   )rE   	transposerF   )rG   rJ   processed_attentions      r   forward_LocationLayer.forward   sA     #001FG1;;AqA"112EF""r   )rE   rF   __name__
__module____qualname____firstlineno____doc__r(   rD   r   rN   __static_attributes____classcell__rH   s   @r   r;   r;   z   sA    

  #
 "	
*#V # # #r   r;   c                      ^  \ rS rSrSrS\S\S\S\S\SS	4U 4S
 jjrS\S\S\S\4S jrS\S\S\S\S\S\	\\4   4S jr
SrU =r$ )
_Attention   a  Locally sensitive attention model.

Args:
    attention_rnn_dim (int): Number of hidden units for RNN.
    encoder_embedding_dim (int): Number of embedding dimensions in the Encoder.
    attention_hidden_dim (int): Dimension of attention hidden representation.
    attention_location_n_filter (int): Number of filters for Attention model.
    attention_location_kernel_size (int): Kernel size for Attention model.
attention_rnn_dimencoder_embedding_dimr?   attention_location_n_filterattention_location_kernel_sizer   Nc                    > [         TU ]  5         [        XSSS9U l        [        X#SSS9U l        [        USSS9U l        [        UUU5      U l        [        S5      * U l	        g )NFrA   rB   r&   r   inf)
rC   rD   r   query_layermemory_layervr;   location_layerfloatscore_mask_value)rG   r\   r]   r?   r^   r_   rH   s         r   rD   _Attention.__init__   sr     	,->[`ntu-!eQW
 ##7G,'* 

 "'ur   queryprocessed_memoryrJ   c                     U R                  UR                  S5      5      nU R                  U5      nU R                  [        R
                  " XE-   U-   5      5      nUR                  S5      nU$ )a  Get the alignment vector.

Args:
    query (Tensor): Decoder output with shape (n_batch, n_mels * n_frames_per_step).
    processed_memory (Tensor): Processed Encoder outputs
        with shape (n_batch, max of ``text_lengths``, attention_hidden_dim).
    attention_weights_cat (Tensor): Cumulative and previous attention weights
        with shape (n_batch, 2, max of ``text_lengths``).

Returns:
    alignment (Tensor): attention weights, it is a tensor with shape (batch, max of ``text_lengths``).
r&   r%   )rb   r3   re   rd   r   rA   squeeze)rG   ri   rj   rJ   processed_queryprocessed_attention_weightsenergies	alignments           r   _get_alignment_energies"_Attention._get_alignment_energies   se     **5??1+=>&*&9&9:O&P#66%**_%RUe%efg$$Q'	r   attention_hidden_statememoryr8   c                     U R                  XU5      nUR                  XPR                  5      n[        R                  " USS9n[
        R                  " UR                  S5      U5      nUR                  S5      nX4$ )aY  Pass the input through the Attention model.

Args:
    attention_hidden_state (Tensor): Attention rnn last output with shape (n_batch, ``attention_rnn_dim``).
    memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
    processed_memory (Tensor): Processed Encoder outputs
        with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
    attention_weights_cat (Tensor): Previous and cumulative attention weights
        with shape (n_batch, current_num_frames * 2, max of ``text_lengths``).
    mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames).

Returns:
    attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
    attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
r&   dim)	rq   masked_fillrg   Fsoftmaxr   bmmr3   rl   )	rG   rs   rt   rj   rJ   r8   rp   attention_weightsattention_contexts	            r   rN   _Attention.forward   sv    . 001G[pq	))$0E0EF	IIiQ7!II&7&A&A!&DfM-55a8 33r   )re   rc   rb   rg   rd   )rQ   rR   rS   rT   rU   r(   rD   r   rq   r   rN   rV   rW   rX   s   @r   rZ   rZ      s    ..  #. "	.
 &). ),. 
.*V v fl qw *4 &4 4 !	4
  &4 4 
vv~	4 4r   rZ   c                   T   ^  \ rS rSrSrS\S\\   SS4U 4S jjrS\S\4S	 jr	S
r
U =r$ )_Preneti  zPrenet Module. It is consists of ``len(output_size)`` linear layers.

Args:
    in_dim (int): The size of each input sample.
    output_sizes (list): The output dimension of each linear layers.
r   	out_sizesr   Nc                    > [         TU ]  5         U/US S -   n[        R                  " [	        X25       VVs/ s H  u  pE[        XESS9PM     snn5      U l        g s  snnf )NFr   )rC   rD   r   
ModuleListzipr   layers)rG   r   r   in_sizesin_sizeout_sizerH   s         r   rD   _Prenet.__init__
  sX    8in,mmY\]eYqrYqBU7wu=Yqr
rs   A
xc                     U R                    H3  n[        R                  " [        R                  " U" U5      5      SSS9nM5     U$ )zPass the input through Prenet.

Args:
    x (Tensor): The input sequence to Prenet with shape (n_batch, in_dim).

Return:
    x (Tensor): Tensor with shape (n_batch, sizes[-1])
      ?T)ptraining)r   ry   dropoutrelu)rG   r   r   s      r   rN   _Prenet.forward  s6     kkF		!&&+sTBA "r   )r   )rQ   rR   rS   rT   rU   r(   r   rD   r   rN   rV   rW   rX   s   @r   r   r     s>    
s 
tCy 
T 
 F  r   r   c                   R   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\S	\4S
 jrSr	U =r
$ )_Postneti   zPostnet Module.

Args:
    n_mels (int): Number of mel bins.
    postnet_embedding_dim (int): Postnet embedding dimension.
    postnet_kernel_size (int): Postnet kernel size.
    postnet_n_convolution (int): Number of postnet convolutions.
n_melspostnet_embedding_dimpostnet_kernel_sizepostnet_n_convolutionc                   > [         T
U ]  5         [        R                  " 5       U l        [        U5       H  nUS:X  a  UOUnXTS-
  :X  a  UOUnXTS-
  :X  a  SOSnXTS-
  :X  a  UOUn	U R                  R                  [        R                  " [        UUUS[        US-
  S-  5      SUS9[        R                  " U	5      5      5        M     [        U R                  5      U l        g )Nr   r&   r   rA   r%   r    r!   r"   r#   r   )rC   rD   r   r   convolutionsrangeappend
Sequentialr+   r(   BatchNorm1dlenn_convs)rG   r   r   r   r   ir   r   	init_gainnum_featuresrH   s             r   rD   _Postnet.__init__*  s     	MMO,-A$%F&0EK%&1+D%E6K`L$%!*C$D&I%&1+D%E6K`L$$%#$$7  #%81%<$A B!"$- NN<0 .( 4,,-r   r   r   c                 *   [        U R                  5       Hy  u  p#X R                  S-
  :  a<  [        R                  " [
        R                  " U" U5      5      SU R                  S9nMS  [        R                  " U" U5      SU R                  S9nM{     U$ )zPass the input through Postnet.

Args:
    x (Tensor): The input sequence with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).

Return:
    x (Tensor): Tensor with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
r&   r   )r   )	enumerater   r   ry   r   r   rA   r   )rG   r   r   convs       r   rN   _Postnet.forwardJ  sn     !!2!23GA<<!##IIejja13OIId1gsT]]C	 4 r   )r   r   rP   rX   s   @r   r   r      sL    ..  #. !	.
  #.@ F  r   r   c                   V   ^  \ rS rSrSrS\S\S\SS4U 4S jjrS	\S
\S\4S jrSr	U =r
$ )_Encoderi]  a  Encoder Module.

Args:
    encoder_embedding_dim (int): Number of embedding dimensions in the encoder.
    encoder_n_convolution (int): Number of convolution layers in the encoder.
    encoder_kernel_size (int): The kernel size in the encoder.

Examples
    >>> encoder = _Encoder(3, 512, 5)
    >>> input = torch.rand(10, 20, 30)
    >>> output = encoder(input)  # shape: (10, 30, 512)
r]   encoder_n_convolutionencoder_kernel_sizer   Nc                   > [         TU ]  5         [        R                  " 5       U l        [        U5       He  n[        R                  " [        UUUS[        US-
  S-  5      SSS9[        R                  " U5      5      nU R                  R                  U5        Mg     [        R                  " U[        US-  5      SSSS9U l        U R                  R                  5         g )Nr&   r%   r   r   T)batch_firstbidirectional)rC   rD   r   r   r   r   r   r+   r(   r   r   LSTMlstmflatten_parameters)rG   r]   r   r   _
conv_layerrH   s         r   rD   _Encoder.__init__k  s     	MMO,-A!)) 3!4q!8A => & 45J $$Z0 . GG!%)*
	 			$$&r   r   input_lengthsc                    U R                    H?  n[        R                  " [        R                  " U" U5      5      SU R                  5      nMA     UR                  SS5      nUR                  5       n[        R                  R                  R                  XSS9nU R                  U5      u  pE[        R                  R                  R                  USS9u  pEU$ )a/  Pass the input through the Encoder.

Args:
    x (Tensor): The input sequences with shape (n_batch, encoder_embedding_dim, n_seq).
    input_lengths (Tensor): The length of each input sequence with shape (n_batch, ).

Return:
    x (Tensor): A tensor with shape (n_batch, n_seq, encoder_embedding_dim).
r   r&   r%   T)r   )r   ry   r   r   r   rL   cpur   utilsrnnpack_padded_sequencer   pad_packed_sequence)rG   r   r   r   outputsr   s         r   rN   _Encoder.forward  s     %%D		!&&a/3>A & KK1%))+HHLL--aD-QYYq\
XX\\55g45P
r   )r   r   rP   rX   s   @r   r   r   ]  sS    '"'  #' !	'
 
'B  6  r   r   c            !         ^  \ rS rSrSrS\S\S\S\S\S\S	\S
\S\S\S\S\S\S\SS4U 4S jjrS\	S\	4S jr
S\	S\\	\	\	\	\	\	\	\	4   4S jrS\	S\	4S jrS\	S\	S\	S\\	\	\	4   4S jrS\	S\	S\	S \	S!\	S"\	S#\	S$\	S\	S%\	S&\	S\\	\	\	\	\	\	\	\	\	4	   4S' jrS\	S(\	S)\	S\\	\	\	4   4S* jrS\	S\	4S+ jr\R&                  R(                  S\	S)\	S\\	\	\	\	4   4S, j5       rS-rU =r$ )._Decoderi  a  Decoder with Attention model.

Args:
    n_mels (int): number of mel bins
    n_frames_per_step (int): number of frames processed per step, only 1 is supported
    encoder_embedding_dim (int): the number of embedding dimensions in the encoder.
    decoder_rnn_dim (int): number of units in decoder LSTM
    decoder_max_step (int): maximum number of output mel spectrograms
    decoder_dropout (float): dropout probability for decoder LSTM
    decoder_early_stopping (bool): stop decoding when all samples are finished
    attention_rnn_dim (int): number of units in attention LSTM
    attention_hidden_dim (int): dimension of attention hidden representation
    attention_location_n_filter (int): number of filters for attention model
    attention_location_kernel_size (int): kernel size for attention model
    attention_dropout (float): dropout probability for attention LSTM
    prenet_dim (int): number of ReLU units in prenet layers
    gate_threshold (float): probability threshold for stop token
r   n_frames_per_stepr]   decoder_rnn_dimdecoder_max_stepdecoder_dropoutdecoder_early_stoppingr\   r?   r^   r_   attention_dropout
prenet_dimgate_thresholdr   Nc                   > [         TU ]  5         Xl        X l        X0l        Xl        X@l        Xl        XPl        Xl	        Xl
        X`l        Xpl        [        X-  X/5      U l        [        R                   " X-   U5      U l        [%        UUU	U
U5      U l        [        R                   " X-   US5      U l        [+        XC-   X-  5      U l        [+        XC-   SSSS9U l        g )NTr&   sigmoidrB   )rC   rD   r   r   r]   r\   r   r   r   r   r   r   r   r   prenetr   LSTMCellattention_rnnrZ   attention_layerdecoder_rnnr   linear_projection
gate_layer)rG   r   r   r]   r   r   r   r   r\   r?   r^   r_   r   r   r   rH   s                  r   rD   _Decoder.__init__  s    $ 	!2%:"!2.$ 0,!2.&<#f8::RS[[)KM^_)! '* 
 ;;'8'PRacgh!2?3Z\b\v!w+3QTy
r   rt   c                     UR                  S5      nUR                  nUR                  n[        R                  " X R
                  U R                  -  X4S9nU$ )a=  Gets all zeros frames to use as the first decoder input.

Args:
    memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

Returns:
    decoder_input (Tensor): all zeros frames with shape
        (n_batch, max of ``text_lengths``, ``n_mels * n_frames_per_step``).
r   r/   r.   sizer/   r.   r   zerosr   r   rG   rt   n_batchr/   r.   decoder_inputs         r   _get_initial_frame_Decoder._get_initial_frame  I     ++a.G[[4;Q;Q-QY^nr   c                 J   UR                  S5      nUR                  S5      nUR                  nUR                  n[        R                  " X R
                  XES9n[        R                  " X R
                  XES9n[        R                  " X R                  XES9n[        R                  " X R                  XES9n	[        R                  " X#XES9n
[        R                  " X#XES9n[        R                  " X R                  XES9nU R                  R                  U5      nUUUU	U
UUU4$ )a  Initializes attention rnn states, decoder rnn states, attention
weights, attention cumulative weights, attention context, stores memory
and stores processed memory.

Args:
    memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

Returns:
    attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
    attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
    decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
    decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
    attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
    attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
    attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
    processed_memory (Tensor): Processed encoder outputs
        with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
r   r&   r   )
r   r/   r.   r   r   r\   r   r]   r   rc   )rG   rt   r   max_timer/   r.   attention_hiddenattention_celldecoder_hiddendecoder_cellr|   attention_weights_cumr}   rj   s                 r   _initialize_decoder_states#_Decoder._initialize_decoder_states  s    * ++a.;;q> ;;w0F0FecW.D.DEaW.B.B%_{{7,@,@]!KKV %GU Z!KK1K1KSXh//<<VD !	
 		
r   decoder_inputsc                     UR                  SS5      nUR                  UR                  S5      [        UR                  S5      U R                  -  5      S5      nUR                  SS5      nU$ )a;  Prepares decoder inputs.

Args:
    decoder_inputs (Tensor): Inputs used for teacher-forced training, i.e. mel-specs,
        with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``)

Returns:
    inputs (Tensor): Processed decoder inputs with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``).
r&   r%   r   r   )rL   viewr   r(   r   )rG   r   s     r   _parse_decoder_inputs_Decoder._parse_decoder_inputs.  so     (11!Q7',,"##A&)?)??@
 (11!Q7r   mel_specgramgate_outputs
alignmentsc                 D   UR                  SS5      R                  5       nUR                  SS5      R                  5       nUR                  SS5      R                  5       nUR                  S   SU R                  4nUR                  " U6 nUR                  SS5      nXU4$ )a  Prepares decoder outputs for output

Args:
    mel_specgram (Tensor): mel spectrogram with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``)
    gate_outputs (Tensor): predicted stop token with shape (max of ``mel_specgram_lengths``, n_batch)
    alignments (Tensor): sequence of attention weights from the decoder
        with shape (max of ``mel_specgram_lengths``, n_batch, max of ``text_lengths``)

Returns:
    mel_specgram (Tensor): mel spectrogram with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``)
    gate_outputs (Tensor): predicted stop token with shape (n_batch, max of ``mel_specgram_lengths``)
    alignments (Tensor): sequence of attention weights from the decoder
        with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``)
r   r&   r   r%   )rL   
contiguousshaper   r   )rG   r   r   r   r   s        r   _parse_decoder_outputs_Decoder._parse_decoder_outputsC  s    &  ))!Q/::<
#--a3>>@#--a3>>@##A&DKK8#((%0#--a3:55r   r   r   r   r   r   r|   r   r}   rj   r8   c           	         [         R                  " X4S5      nU R                  XU45      u  p#[        R                  " X R
                  U R                  5      n[         R                  " UR                  S5      UR                  S5      4SS9nU R                  X)XU5      u  pXv-  n[         R                  " X(4S5      nU R                  XU45      u  pE[        R                  " X@R                  U R                  5      n[         R                  " XH4SS9nU R                  U5      nU R                  U5      nUUUUUUUUU4	$ )af  Decoder step using stored states, attention and memory

Args:
    decoder_input (Tensor): Output of the Prenet with shape (n_batch, ``prenet_dim``).
    attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
    attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
    decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
    decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
    attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
    attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
    attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
    memory (Tensor): Encoder output with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
    processed_memory (Tensor): Processed Encoder outputs
        with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
    mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames).

Returns:
    decoder_output: Predicted mel spectrogram for the current frame with shape (n_batch, ``n_mels``).
    gate_prediction (Tensor): Prediction of the stop token with shape (n_batch, ``1``).
    attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
    attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
    decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
    decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
    attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
    attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
    attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
r   r&   rv   )r   catr   ry   r   r   r   r3   r   r   r   r   r   )rG   r   r   r   r   r   r|   r   r}   rt   rj   r8   
cell_inputrJ    decoder_hidden_attention_contextdecoder_outputgate_predictions                    r   decode_Decoder.decodec  sH   R YYA2F
+/+=+=j]kJl+m(99%57M7Mt}}] %		+<+F+Fq+IK`KjKjklKm*ntu v/3/C/C&6t0
, 	2		#3"GL'+'7'7XdGe'f$>3G3GW+099n5X^_+`(//0PQ//*JK !

 
	
r   mel_specgram_truthmemory_lengthsc                    U R                  U5      R                  S5      nU R                  U5      n[        R                  " XE4SS9nU R                  U5      n[        U5      nU R                  U5      u  nnn	n
nnnn/ / / nnn[        U5      UR                  S5      S-
  :  a  U[        U5         nU R                  UUUU	U
UUUUUU5      u	  nnnnn	n
nnnUUR                  S5      /-  nUUR                  S5      /-  nUU/-  n[        U5      UR                  S5      S-
  :  a  M  U R                  [        R                  " U5      [        R                  " U5      [        R                  " U5      5      u  nnnUUU4$ )av  Decoder forward pass for training.

Args:
    memory (Tensor): Encoder outputs
        with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
    mel_specgram_truth (Tensor): Decoder ground-truth mel-specs for teacher forcing
        with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
    memory_lengths (Tensor): Encoder output lengths for attention masking
        (the same as ``text_lengths``) with shape (n_batch, ).

Returns:
    mel_specgram (Tensor): Predicted mel spectrogram
        with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
    gate_outputs (Tensor): Predicted stop token for each timestep
        with shape (n_batch,  max of ``mel_specgram_lengths``).
    alignments (Tensor): Sequence of attention weights from the decoder
        with shape (n_batch,  max of ``mel_specgram_lengths``, max of ``text_lengths``).
r   rv   r&   )r   r3   r   r   r   r   r9   r   r   r   r   rl   r   stack)rG   rt   r   r   r   r   r8   r   r   r   r   r|   r   r}   rj   mel_outputsr   r   
mel_outputgate_outputr   s                        r   rN   _Decoder.forward  s   , //7AA!D334FGM#BJ^4%n5 ++F3		
! 13B:\+!4!4Q!7!!;;*3{+;<M  !%! 
 !%! J..q122K[00344L,--J9 +!4!4Q!7!!;;< 261L1LKK$ekk,&?ZAX2
.lJ \:55r   c                     UR                  S5      nUR                  nUR                  n[        R                  " X R
                  U R                  -  X4S9nU$ )a%  Gets all zeros frames to use as the first decoder input

args:
    memory (Tensor): Encoder outputs
        with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

returns:
    decoder_input (Tensor): All zeros frames with shape(n_batch, ``n_mels`` * ``n_frame_per_step``).
r   r   r   r   s         r   _get_go_frame_Decoder._get_go_frame  r   r   c                    UR                  S5      UR                  pCU R                  U5      n[        U5      nU R	                  U5      u  nnn	n
nnnn[
        R                  " U/[
        R                  US9n[
        R                  " U/[
        R                  US9n/ n/ n/ n[        U R                  5       H  nU R                  U5      nU R                  UUUU	U
UUUUUU5      u	  nnnnn	n
nnnUR                  UR                  S5      5        UR                  UR                  SS5      5        UR                  U5        UU) ==   S-  ss'   U[
        R                   " UR#                  S5      5      U R$                  :  -  nU R&                  (       a  [
        R(                  " U5      (       a    OUnGM     [+        U5      U R                  :X  a  [,        R.                  " S5        [
        R0                  " USS9n[
        R0                  " USS9n[
        R0                  " USS9nU R3                  UUU5      u  nnnUUUU4$ )a"  Decoder inference

Args:
    memory (Tensor): Encoder outputs
        with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
    memory_lengths (Tensor): Encoder output lengths for attention masking
        (the same as ``text_lengths``) with shape (n_batch, ).

Returns:
    mel_specgram (Tensor): Predicted mel spectrogram
        with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
    mel_specgram_lengths (Tensor): the length of the predicted mel spectrogram (n_batch, ))
    gate_outputs (Tensor): Predicted stop token for each timestep
        with shape (n_batch,  max of ``mel_specgram_lengths``).
    alignments (Tensor): Sequence of attention weights from the decoder
        with shape (n_batch,  max of ``mel_specgram_lengths``, max of ``text_lengths``).
r   r   r&   zZReached max decoder steps. The generated spectrogram might not cover the whole transcript.rv   )r   r.   r  r9   r   r   r   int32boolr   r   r   r   r   r3   rL   r   rl   r   r   allr   warningswarnr   r   )rG   rt   r   
batch_sizer.   r   r8   r   r   r   r   r|   r   r}   rj   mel_specgram_lengthsfinishedmel_specgramsr   r   r   r   r  s                          r   infer_Decoder.infer
  s7   & $[[^V]]F**62%n5 ++F3		
!  %{{J<u{{SYZ;;
|5::fM&(%'#%
t,,-A KK6M  !%! 
 !%!   !7!7!:; 5 5a ;</0 (+q0+k&9&9!&<=@S@SSSH**uyy/B/B(MG .J }!6!66MMo 		-Q7yy15YYzq1
262M2Mm]iku2v/|Z2L*LLr   )r   r   r   r\   r   r   r   r   r   r]   r   r   r   r   r   r   r   )rQ   rR   rS   rT   rU   r(   rf   r
  rD   r   r   r   r   r   r   r   rN   r  r   jitexportr  rV   rW   rX   s   @r   r   r     s^   &1
1
 1
  #	1

 1
 1
 1
 !%1
 1
 "1
 &)1
 ),1
 !1
 1
 1
  
!1
f F "/
/
	vvvvvvvvM	N/
bF v *6"6286FL6	vvv%	&6@H
H
 !H
 	H

 H
 H
 "H
  &H
 "H
 H
 !H
 H
 
vvvvvvvvvU	VH
TJ6J628J6JPJ6	vvv%	&J6XF v " YYWMF WMF WMuVVU[]cEc?d WM WMr   r   c            /       F  ^  \ rS rSrSr                      S$S\S\S\S\S\S\S	\S
\S\S\S\S\S\S\S\S\S\S\S\S\S\S\SS4.U 4S jjjrS\	S\	S\	S\	S\
\	\	\	\	4   4
S  jr\R                  R                  S%S\	S!\\	   S\
\	\	\	4   4S" jj5       rS#rU =r$ )&r
   ie  aa	  Tacotron2 model from *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions*
:cite:`shen2018natural` based on the implementation from
`Nvidia Deep Learning Examples <https://github.com/NVIDIA/DeepLearningExamples/>`_.

See Also:
    * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

Args:
    mask_padding (bool, optional): Use mask padding (Default: ``False``).
    n_mels (int, optional): Number of mel bins (Default: ``80``).
    n_symbol (int, optional): Number of symbols for the input text (Default: ``148``).
    n_frames_per_step (int, optional): Number of frames processed per step, only 1 is supported (Default: ``1``).
    symbol_embedding_dim (int, optional): Input embedding dimension (Default: ``512``).
    encoder_n_convolution (int, optional): Number of encoder convolutions (Default: ``3``).
    encoder_kernel_size (int, optional): Encoder kernel size (Default: ``5``).
    encoder_embedding_dim (int, optional): Encoder embedding dimension (Default: ``512``).
    decoder_rnn_dim (int, optional): Number of units in decoder LSTM (Default: ``1024``).
    decoder_max_step (int, optional): Maximum number of output mel spectrograms (Default: ``2000``).
    decoder_dropout (float, optional): Dropout probability for decoder LSTM (Default: ``0.1``).
    decoder_early_stopping (bool, optional): Continue decoding after all samples are finished (Default: ``True``).
    attention_rnn_dim (int, optional): Number of units in attention LSTM (Default: ``1024``).
    attention_hidden_dim (int, optional): Dimension of attention hidden representation (Default: ``128``).
    attention_location_n_filter (int, optional): Number of filters for attention model (Default: ``32``).
    attention_location_kernel_size (int, optional): Kernel size for attention model (Default: ``31``).
    attention_dropout (float, optional): Dropout probability for attention LSTM (Default: ``0.1``).
    prenet_dim (int, optional): Number of ReLU units in prenet layers (Default: ``256``).
    postnet_n_convolution (int, optional): Number of postnet convolutions (Default: ``5``).
    postnet_kernel_size (int, optional): Postnet kernel size (Default: ``5``).
    postnet_embedding_dim (int, optional): Postnet embedding dimension (Default: ``512``).
    gate_threshold (float, optional): Probability threshold for stop token (Default: ``0.5``).
mask_paddingr   n_symbolr   symbol_embedding_dimr]   r   r   r   r   r   r   r\   r?   r^   r_   r   r   r   r   r   r   r   Nc                 x  > [         TU ]  5         Xl        X l        X@l        [
        R                  " X55      U l        [        R
                  R                  R                  U R                  R                  5        [        XgU5      U l        [        UUUU	U
UUUUUUUUU5      U l        [!        UUUU5      U l        g N)rC   rD   r  r   r   r   	Embedding	embeddingr   r   r   r   r   encoderr   decoderr   postnet)rG   r  r   r  r   r  r]   r   r   r   r   r   r   r\   r?   r^   r_   r   r   r   r   r   r   rH   s                          r   rD   Tacotron2.__init__  s    2 	(!2hE%%dnn&;&;< 5Nab!" '*
   (=?RTijr   tokenstoken_lengthsr   r  c                    U R                  U5      R                  SS5      nU R                  XR5      nU R                  XcUS9u  p7nU R	                  U5      n	X9-   n	U R
                  (       a  [        U5      n
U
R                  U R                  U
R                  S5      U
R                  S5      5      n
U
R                  SSS5      n
UR                  U
S5        U	R                  U
S5        UR                  U
SS2SSS24   S5        X9Xx4$ )a  Pass the input through the Tacotron2 model. This is in teacher
forcing mode, which is generally used for training.

The input ``tokens`` should be padded with zeros to length max of ``token_lengths``.
The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``.

Args:
    tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of token_lengths)`.
    token_lengths (Tensor): The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
    mel_specgram (Tensor): The target mel spectrogram
        with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
    mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.

Returns:
    [Tensor, Tensor, Tensor, Tensor]:
        Tensor
            Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
        Tensor
            Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
        Tensor
            The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`.
        Tensor
            Sequence of attention weights from the decoder with
            shape `(n_batch, max of mel_specgram_lengths, max of token_lengths)`.
r&   r%   )r   r   g        Ng     @@)r  rL   r  r  r   r  r9   expandr   r   permutemasked_fill_)rG   r"  r#  r   r  embedded_inputsencoder_outputsr   r   mel_specgram_postnetr8   s              r   rN   Tacotron2.forward  s    B ..0::1a@,,F15- 2> 2
.J  $||L9+B)*>?D;;t{{DIIaL$))A,GD<<1a(D%%dC0 --dC8%%d1a7mS9<KKr   r,   c                    UR                   u  p4UcJ  [        R                  " U/5      R                  U5      R	                  UR
                  UR                  5      nUc   eU R                  U5      R                  SS5      nU R                  XR5      nU R                  R                  Xb5      u  pxpU R                  U5      nX{-   nU
R                  SX35      R                  SS5      n
XU
4$ )a  Using Tacotron2 for inference. The input is a batch of encoded
sentences (``tokens``) and its corresponding lengths (``lengths``). The
output is the generated mel spectrograms, its corresponding lengths, and
the attention weights from the decoder.

The input `tokens` should be padded with zeros to length max of ``lengths``.

Args:
    tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of lengths)`.
    lengths (Tensor or None, optional):
        The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
        If ``None``, it is assumed that the all the tokens are valid. Default: ``None``

Returns:
    (Tensor, Tensor, Tensor):
        Tensor
            The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
        Tensor
            The length of the predicted mel spectrogram with shape `(n_batch, )`.
        Tensor
            Sequence of attention weights from the decoder with shape
            `(n_batch, max of mel_specgram_lengths, max of lengths)`.
r&   r%   r   )r   r   tensorr%  tor.   r/   r  rL   r  r  r  r   unfold)rG   r"  r,   r   
max_lengthr(  r)  r   r  r   r   mel_outputs_postnets               r   r  Tacotron2.infer  s    2 %ll?llJ<077@CCFMMSYS_S_`G"""..0::1a@,,@<@LL<N<N<h9A"ll<8*@&&q';EEaK
"*DDr   )r  r  r  r  r   r   r   )FP      r&      r5           i  皙?Tr8            r9     r7  r7  r5  r   r  )rQ   rR   rS   rT   rU   r
  r(   rf   rD   r   r   rN   r   r  r  r   r  rV   rW   rX   s   @r   r
   r
   e  s   D #!"$'%(%&#$# $!$'+!%$'+-.0#&%&#$%( #/1k1k 1k 	1k
 1k "1k  #1k  #1k !1k 1k 1k 1k !%1k 1k "1k  &)!1k" ),#1k$ !%1k& '1k(  #)1k* !+1k,  #-1k. /1k0 
11k 1kf4L4L 4L 	4L
 %4L 
vvvv-	.4Ll YY&EF &EXf-= &EvW]_eOeIf &E &Er   )Tr   )r&   r&   Nr&   Tr   )r  typingr   r   r   r   r   r   r   torch.nnr	   ry   __all__r(   r
  strr   r   r)   r+   r9   Moduler;   rZ   r   r   r   r   r
    r   r   <module>rD     s|  8  / /   $ 
c C t QT didldldsds * 59+++ + 	+
 eCeCj012+ + + + XX__+\F v ".#RYY .#bT4 T4nbii <:ryy :zEryy EP}Mryy }M@qE		 qEr   