
    ~Vji<              	       (   d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
Z
d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d	d
lmZ d	dlmZ g ZdZ G d dej                  Z G d dej                  Z G d de
j        j         ej!                  Z" G d de
j        j         ej!                  Z# G d d          Z$ G d d          Z%e G d d                      Z&e G d d                      Z' G d d          Z(e G d d e'e&e$e                      Z)e G d! d"e'e&e%e                      Z*e G d# d$e(e&e$e                      Z+e G d% d&e(e&e%e                      Z, e+d' ej-        d()          *          Z.d+e._/         e,d, ej-        d-)          *          Z0d.e0_/         e)d/ ej-        d()          d0 ej1                    1          Z2d2e2_/         e*d3 ej-        d-)          d0 ej1                    1          Z3d4e3_/        dS )5    N)	dataclass)AnyDictListOptionalTupleUnion)Tensor)load_state_dict_from_url)mu_law_decoding)	Tacotron2WaveRNN)
GriffinLimInverseMelScale   )utils)Tacotron2TTSBundlez.https://download.pytorch.org/torchaudio/modelsc                   p     e Zd Z fdZed             Zdeeee         f         de	e
e
f         fdZ xZS )_EnglishCharProcessorc                     t                                                       t          j                    | _        d t          | j                  D             | _        d S )Nc                     i | ]\  }}||	S  r   ).0iss      X/root/voice-cloning/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/impl.py
<dictcomp>z2_EnglishCharProcessor.__init__.<locals>.<dictcomp>       BBB$!QABBB    )super__init__r   
_get_chars_tokens	enumerate_mappingself	__class__s    r   r!   z_EnglishCharProcessor.__init__   sL    '))BB)DL*A*ABBBr   c                     | j         S Nr#   r'   s    r   tokensz_EnglishCharProcessor.tokens   
    |r   textsreturnc                 x     t          |t                    r|g} fd|D             }t          j        |          S )Nc                 P    g | ]"}fd |                                 D             #S )c                 <    g | ]}|j         v j         |         S r   r%   )r   cr'   s     r   
<listcomp>z=_EnglishCharProcessor.__call__.<locals>.<listcomp>.<listcomp>&   s,    NNN1;M;MDM!$;M;M;Mr   )lower)r   tr'   s     r   r6   z2_EnglishCharProcessor.__call__.<locals>.<listcomp>&   s7    ^^^STNNNNaggiiNNN^^^r   )
isinstancestrr   
_to_tensor)r'   r/   indicess   `  r   __call__z_EnglishCharProcessor.__call__#   sG    eS!! 	GE^^^^X]^^^(((r   __name__
__module____qualname__r!   propertyr-   r	   r:   r   r   r
   r=   __classcell__r(   s   @r   r   r      s        C C C C C
   X)eCcN3 )ffn8M ) ) ) ) ) ) ) )r   r   c                   v     e Zd Zdd fd
Zed             Zdeeee         f         de	e
e
f         fdZ xZS )_EnglishPhoneProcessorN	dl_kwargsc                   t                                                       t          j                    | _        d t          | j                  D             | _        t          j        d|          | _        d| _	        d S )Nc                     i | ]\  }}||	S r   r   )r   r   ps      r   r   z3_EnglishPhoneProcessor.__init__.<locals>.<dictcomp>.   r   r   zen_us_cmudict_forward.ptrG   z(\[[A-Z]+?\]|[_!'(),.:;? -]))
r    r!   r   _get_phonesr#   r$   r%   _load_phonemizer_phonemizer_pattern)r'   rH   r(   s     r   r!   z_EnglishPhoneProcessor.__init__+   sm    (**BB)DL*A*ABBB 12LXabbb7r   c                     | j         S r*   r+   r,   s    r   r-   z_EnglishPhoneProcessor.tokens2   r.   r   r/   r0   c                     t          |t                    r|g}g }                     |d          D ]G}d t          j         j        |          D             }|                     fd|D                        Ht          j        |          S )Nen_us)langc                 :    g | ]}t          j        d d|          S )z[\[\]] )resub)r   rs     r   r6   z3_EnglishPhoneProcessor.__call__.<locals>.<listcomp>=   s&    WWW26)R++WWWr   c                 *    g | ]}j         |         S r   r4   )r   rK   r'   s     r   r6   z3_EnglishPhoneProcessor.__call__.<locals>.<listcomp>>   s     :::DM!,:::r   )	r9   r:   rN   rV   findallrO   appendr   r;   )r'   r/   r<   phonesrets   `    r   r=   z_EnglishPhoneProcessor.__call__6   s    eS!! 	GE&&u7&;; 	< 	<FWWRZv5V5VWWWCNN::::c:::;;;;(((r   r>   rD   s   @r   rF   rF   *   s        $( 8 8 8 8 8 8 8   X	)eCcN3 	)ffn8M 	) 	) 	) 	) 	) 	) 	) 	)r   rF   c                   T     e Zd Zddedee         f fdZed             Zd	dZ	 xZ
S )
_WaveRNNVocodermodelmin_level_dbc                 r    t                                                       d| _        || _        || _        d S )N"V  )r    r!   _sample_rate_model_min_level_db)r'   ra   rb   r(   s      r   r!   z_WaveRNNVocoder.__init__H   s6    !)r   c                     | j         S r*   re   r,   s    r   sample_ratez_WaveRNNVocoder.sample_rateN         r   Nc                    t          j        |          }dt          j        t          j        |d                    z  }| j        )| j        |z
  | j        z  }t          j        |dd          }| j                            ||          \  }}t          j        || j        j	                  }t          || j        j                  }|                    d          }||fS )N   gh㈵>)minr   r   )rn   max)torchexplog10clamprg   rf   inferr   _unnormalize_waveformn_bitsr   	n_classessqueeze)r'   mel_speclengthswaveforms       r   forwardz_WaveRNNVocoder.forwardR   s    9X&&EKd$C$C$CDDD)*X59KKH{8:::H K--h@@'.x9KLL"8T[-BCC##A&&  r   )r`   r*   )r?   r@   rA   r   r   floatr!   rB   rj   r|   rC   rD   s   @r   r_   r_   G   s        * *g *Xe_ * * * * * * ! ! X!
! 
! 
! 
! 
! 
! 
! 
!r   r_   c                   <     e Zd Z fdZed             ZddZ xZS )_GriffinLimVocoderc           	          t                                                       d| _        t          dd| j        dddd          | _        t          dd	d
d          | _        d S )Nrd   i  P   g        g     @@slaney)n_stftn_melsrj   f_minf_max	mel_scalenormi   r      )n_fftpower
hop_length
win_length)r    r!   re   r   rj   _inv_melr   _griffin_limr&   s    r   r!   z_GriffinLimVocoder.__init__`   sz    !'!(
 
 
 '	
 
 
r   c                     | j         S r*   ri   r,   s    r   rj   z_GriffinLimVocoder.sample_rates   rk   r   Nc                 F   t          j        |          }|                                                                                    d          }|                     |          }|                                                    d          }|                     |          }||fS )NTF)rp   rq   clonedetachrequires_grad_r   r   )r'   ry   rz   spec	waveformss        r   r|   z_GriffinLimVocoder.forwardw   s    9X&&>>##**,,;;DAA}}X&&{{}}++E22%%d++	'!!r   r*   )r?   r@   rA   r!   rB   rj   r|   rC   rD   s   @r   r   r   _   sg        
 
 
 
 
& ! ! X!" " " " " " " "r   r   c                   $    e Zd Zdej        fdZdS )
_CharMixinr0   c                     t                      S r*   )r   r,   s    r   get_text_processorz_CharMixin.get_text_processor   s    $&&&r   Nr?   r@   rA   r   TextProcessorr   r   r   r   r   r      s3        '$6$D ' ' ' ' ' 'r   r   c                   *    e Zd Zdddej        fdZdS )_PhoneMixinNrG   r0   c                "    t          |          S NrG   )rF   )r'   rH   s     r   r   z_PhoneMixin.get_text_processor   s    %	::::r   r   r   r   r   r   r      s@        .2 ; ; ;7I7W ; ; ; ; ; ;r   r   c                   F    e Zd ZU eed<   eeef         ed<   dddefdZdS )_Tacotron2Mixin_tacotron2_path_tacotron2_paramsNrG   r0   c                    t          di | j        }t           d| j         }|i n|}t	          |fi |}|                    |           |                                 |S N/r   )r   r   	_BASE_URLr   r   load_state_dictevalr'   rH   ra   url
state_dicts        r   get_tacotron2z_Tacotron2Mixin.get_tacotron2   sw    33D23333T133#+BB	-c??Y??
j)))

r   )	r?   r@   rA   r:   __annotations__r   r   r   r   r   r   r   r   r      s^         CH~%%%)-   )      r   r   c                   d    e Zd ZU ee         ed<   eeeef                  ed<   dddZdddZ	dS )_WaveRNNMixin_wavernn_path_wavernn_paramsNrG   c                L    |                      |          }t          |          S r   )_get_wavernnr_   )r'   rH   wavernns      r   get_vocoderz_WaveRNNMixin.get_vocoder   s&    ##i#88w'''r   c                    t          di | j        }t           d| j         }|i n|}t	          |fi |}|                    |           |                                 |S r   )r   r   r   r   r   r   r   r   s        r   r   z_WaveRNNMixin._get_wavernn   sw    //$.//11T/11#+BB	-c??Y??
j)))

r   )
r?   r@   rA   r   r:   r   r   r   r   r   r   r   r   r   r      sy         C=   d38n----'+ ( ( ( ( ( )-       r   r   c                       e Zd Zd ZdS )_GriffinLimMixinc                     t                      S r*   )r   )r'   _s     r   r   z_GriffinLimMixin.get_vocoder   s    !###r   N)r?   r@   rA   r   r   r   r   r   r      s#        $ $ $ $ $r   r   c                       e Zd ZdS )_Tacotron2WaveRNNCharBundleNr?   r@   rA   r   r   r   r   r              Dr   r   c                       e Zd ZdS )_Tacotron2WaveRNNPhoneBundleNr   r   r   r   r   r      r   r   r   c                       e Zd ZdS )_Tacotron2GriffinLimCharBundleNr   r   r   r   r   r      r   r   r   c                       e Zd ZdS )_Tacotron2GriffinLimPhoneBundleNr   r   r   r   r   r      r   r   r   z5tacotron2_english_characters_1500_epochs_ljspeech.pth&   )	n_symbols)r   r   a  Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The default parameters were used.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
z3tacotron2_english_phonemes_1500_epochs_ljspeech.pth`   a  Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The text processor is set to the *"english_phonemes"*.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

z=tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pthz%wavernn_10k_epochs_8bits_ljspeech.pth)r   r   r   r   a  Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and :py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
z;tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.ptha  Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script for Tacotron2 `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script for WaveRNN `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>


Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
)4rV   dataclassesr   typingr   r   r   r   r   r	   rp   r
   torchaudio._internalr   torchaudio.functionalr   torchaudio.modelsr   r   torchaudio.transformsr   r   rU   r   	interfacer   __all__r   r   r   rF   nnModuleVocoderr_   r   r   r   r   r   r   r   r   r   r   _get_taco_params"TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH__doc__#TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_get_wrnn_paramsTACOTRON2_WAVERNN_CHAR_LJSPEECH TACOTRON2_WAVERNN_PHONE_LJSPEECHr   r   r   <module>r      s   				 ! ! ! ! ! ! : : : : : : : : : : : : : : : :        9 9 9 9 9 9 1 1 1 1 1 1 0 0 0 0 0 0 0 0 = = = = = = = =       ) ) ) ) ) )
<	) ) ) ) ).< ) ) )") ) ) ) )/= ) ) ):! ! ! ! !eho'9'A ! ! !0" " " " "*<*D " " "L' ' ' ' ' ' ' '
; ; ; ; ; ; ; ;
                $$ $ $ $ $ $ $ $ 	 	 	 	 	-*N` 	 	 	 	 	 	 	 	=/;Pb 	 	 	 	 	 	 	 	%5
Tf 	 	 	 	 	 	 	 	&6Vh 	 	 	 &D%CK,e,r:::& & & "!. " *F 'F&EI,e,r:::' ' ' #&/ # +P #>"=S,e,r:::9*E*,,	# # # #+  'J $@#?Q,e,r:::9*E*,,	$ $ $  ),   ( ( (r   