a
    h<                     @   s&  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
Z
d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d	d
lmZ d	dlmZ g ZdZG dd dejZG dd dejZG dd de
jj ej!Z"G dd de
jj ej!Z#G dd dZ$G dd dZ%eG dd dZ&eG dd dZ'G dd dZ(eG dd  d e'e&e$eZ)eG d!d" d"e'e&e%eZ*eG d#d$ d$e(e&e$eZ+eG d%d& d&e(e&e%eZ,e+d'ej-d(d)d*Z.d+e._/e,d,ej-d-d)d*Z0d.e0_/e)d/ej-d(d)d0e1 d1Z2d2e2_/e*d3ej-d-d)d0e1 d1Z3d4e3_/dS )5    N)	dataclass)AnyDictListOptionalTupleUnion)Tensor)load_state_dict_from_url)mu_law_decoding)	Tacotron2WaveRNN)
GriffinLimInverseMelScale   )utils)Tacotron2TTSBundlez.https://download.pytorch.org/torchaudio/modelsc                       sL   e Zd Z fddZedd Zeeee f e	e
e
f dddZ  ZS )_EnglishCharProcessorc                    s.   t    t | _dd t| jD | _d S )Nc                 S   s   i | ]\}}||qS  r   ).0isr   r   L/var/www/auris/lib/python3.9/site-packages/torchaudio/pipelines/_tts/impl.py
<dictcomp>       z2_EnglishCharProcessor.__init__.<locals>.<dictcomp>)super__init__r   Z
_get_chars_tokens	enumerate_mappingself	__class__r   r   r      s    

z_EnglishCharProcessor.__init__c                 C   s   | j S Nr   r    r   r   r   tokens   s    z_EnglishCharProcessor.tokenstextsreturnc                    s,   t |tr|g} fdd|D }t|S )Nc                    s"   g | ]} fd d|  D qS )c                    s    g | ]}| j v r j | qS r   r   )r   cr    r   r   
<listcomp>&   r   z=_EnglishCharProcessor.__call__.<locals>.<listcomp>.<listcomp>)lower)r   tr    r   r   r,   &   r   z2_EnglishCharProcessor.__call__.<locals>.<listcomp>)
isinstancestrr   
_to_tensor)r!   r(   indicesr   r    r   __call__#   s    
z_EnglishCharProcessor.__call____name__
__module____qualname__r   propertyr&   r   r0   r   r   r	   r3   __classcell__r   r   r"   r   r      s   
r   c                       sR   e Zd Zdd fdd
Zedd Zeeee f e	e
e
f ddd	Z  ZS )
_EnglishPhoneProcessorN	dl_kwargsc                   sD   t    t | _dd t| jD | _tjd|d| _d| _	d S )Nc                 S   s   i | ]\}}||qS r   r   )r   r   pr   r   r   r   .   r   z3_EnglishPhoneProcessor.__init__.<locals>.<dictcomp>zen_us_cmudict_forward.ptr;   z(\[[A-Z]+?\]|[_!'(),.:;? -]))
r   r   r   Z_get_phonesr   r   r   Z_load_phonemizer_phonemizer_patternr!   r<   r"   r   r   r   +   s
    

z_EnglishPhoneProcessor.__init__c                 C   s   | j S r$   r%   r    r   r   r   r&   2   s    z_EnglishPhoneProcessor.tokensr'   c                    sb   t |tr|g}g } j|ddD ]4}dd t j|D }| fdd|D  q"t|S )Nen_us)langc                 S   s   g | ]}t d d|qS )z[\[\]] )resub)r   rr   r   r   r,   =   r   z3_EnglishPhoneProcessor.__call__.<locals>.<listcomp>c                    s   g | ]} j | qS r   r*   )r   r=   r    r   r   r,   >   r   )	r/   r0   r>   rD   findallr?   appendr   r1   )r!   r(   r2   Zphonesretr   r    r   r3   6   s    
z_EnglishPhoneProcessor.__call__r4   r   r   r"   r   r:   *   s   
r:   c                       s@   e Zd Zd
eee d fddZedd Zddd	Z	  Z
S )_WaveRNNVocoder)modelmin_level_dbc                    s    t    d| _|| _|| _d S )N"V  )r   r   _sample_rate_model_min_level_db)r!   rL   rM   r"   r   r   r   H   s    
z_WaveRNNVocoder.__init__c                 C   s   | j S r$   rO   r    r   r   r   sample_rateN   s    z_WaveRNNVocoder.sample_rateNc                 C   s   t |}dt t j|dd }| jd urL| j| | j }t j|ddd}| j||\}}t|| jj	}t
|| jj}|d}||fS )N   gh㈵>)minr   r   )rU   max)torchexplog10clamprQ   rP   Zinferr   Z_unnormalize_waveformZn_bitsr   Z	n_classesZsqueeze)r!   mel_speclengthsZwaveformr   r   r   forwardR   s    


z_WaveRNNVocoder.forward)rK   )N)r5   r6   r7   r   r   floatr   r8   rS   r]   r9   r   r   r"   r   rJ   G   s   
rJ   c                       s2   e Zd Z fddZedd ZdddZ  ZS )	_GriffinLimVocoderc              	      s@   t    d| _tdd| jddddd| _tdd	d
dd| _d S )NrN   i  P   g        g     @@Zslaney)Zn_stftZn_melsrS   Zf_minZf_maxZ	mel_scaleZnormi   r      )Zn_fftpowerZ
hop_lengthZ
win_length)r   r   rO   r   rS   _inv_melr   _griffin_limr    r"   r   r   r   `   s"    
	z_GriffinLimVocoder.__init__c                 C   s   | j S r$   rR   r    r   r   r   rS   s   s    z_GriffinLimVocoder.sample_rateNc                 C   sF   t |}|  d}| |}| d}| |}||fS )NTF)rW   rX   clonedetachZrequires_grad_rc   rd   )r!   r[   r\   specZ	waveformsr   r   r   r]   w   s    


z_GriffinLimVocoder.forward)N)r5   r6   r7   r   r8   rS   r]   r9   r   r   r"   r   r_   _   s   
r_   c                   @   s   e Zd ZejdddZdS )
_CharMixinr)   c                 C   s   t  S r$   )r   r    r   r   r   get_text_processor   s    z_CharMixin.get_text_processorNr5   r6   r7   r   TextProcessorrj   r   r   r   r   rh      s   rh   c                   @   s"   e Zd ZddejdddZdS )_PhoneMixinNr;   ri   c                C   s
   t |dS Nr;   )r:   r@   r   r   r   rj      s    z_PhoneMixin.get_text_processorrk   r   r   r   r   rm      s   rm   c                   @   s:   e Zd ZU eed< eeef ed< ddedddZdS )_Tacotron2Mixin_tacotron2_path_tacotron2_paramsNr;   ri   c                C   sV   t f i | j}t d| j }|d u r,i n|}t|fi |}|| |  |S N/)r   rq   	_BASE_URLrp   r
   load_state_dictevalr!   r<   rL   urlZ
state_dictr   r   r   get_tacotron2   s    
z_Tacotron2Mixin.get_tacotron2)	r5   r6   r7   r0   __annotations__r   r   r   ry   r   r   r   r   ro      s   
ro   c                   @   sJ   e Zd ZU ee ed< eeeef  ed< ddddZddddZ	dS )	_WaveRNNMixin_wavernn_path_wavernn_paramsNr;   c                C   s   | j |d}t|S rn   )_get_wavernnrJ   )r!   r<   Zwavernnr   r   r   get_vocoder   s    z_WaveRNNMixin.get_vocoderc                C   sV   t f i | j}t d| j }|d u r,i n|}t|fi |}|| |  |S rr   )r   r}   rt   r|   r
   ru   rv   rw   r   r   r   r~      s    
z_WaveRNNMixin._get_wavernn)
r5   r6   r7   r   r0   rz   r   r   r   r~   r   r   r   r   r{      s   
r{   c                   @   s   e Zd Zdd ZdS )_GriffinLimMixinc                 K   s   t  S r$   )r_   )r!   _r   r   r   r      s    z_GriffinLimMixin.get_vocoderN)r5   r6   r7   r   r   r   r   r   r      s   r   c                   @   s   e Zd ZdS )_Tacotron2WaveRNNCharBundleNr5   r6   r7   r   r   r   r   r      s   r   c                   @   s   e Zd ZdS )_Tacotron2WaveRNNPhoneBundleNr   r   r   r   r   r      s   r   c                   @   s   e Zd ZdS )_Tacotron2GriffinLimCharBundleNr   r   r   r   r   r      s   r   c                   @   s   e Zd ZdS )_Tacotron2GriffinLimPhoneBundleNr   r   r   r   r   r      s   r   z5tacotron2_english_characters_1500_epochs_ljspeech.pth&   )Z	n_symbols)rp   rq   a  Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The default parameters were used.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
z3tacotron2_english_phonemes_1500_epochs_ljspeech.pth`   a  Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The text processor is set to the *"english_phonemes"*.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

z=tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pthz%wavernn_10k_epochs_8bits_ljspeech.pth)rp   rq   r|   r}   a  Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and :py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
z;tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.ptha  Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script for Tacotron2 `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script for WaveRNN `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>


Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
)4rD   Zdataclassesr   typingr   r   r   r   r   r   rW   r	   Ztorchaudio._internalr
   Ztorchaudio.functionalr   Ztorchaudio.modelsr   r   Ztorchaudio.transformsr   r   rC   r   Z	interfacer   __all__rt   rl   r   r:   nnModuleZVocoderrJ   r_   rh   rm   ro   r{   r   r   r   r   r   Z_get_taco_paramsZ"TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH__doc__Z#TACOTRON2_GRIFFINLIM_PHONE_LJSPEECHZ_get_wrnn_paramsZTACOTRON2_WAVERNN_CHAR_LJSPEECHZ TACOTRON2_WAVERNN_PHONE_LJSPEECHr   r   r   r   <module>   sn    &
	
#
(
%
