
    Ph=              	          d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
Z
d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d	d
lmZ d	dlmZ g ZdZ G d dej8                        Z G d dej8                        Z G d de
j>                  j@                  ejB                        Z" G d de
j>                  j@                  ejB                        Z# G d d      Z$ G d d      Z%e G d d             Z&e G d d             Z' G d d      Z(e G d d e'e&e$e             Z)e G d! d"e'e&e%e             Z*e G d# d$e(e&e$e             Z+e G d% d&e(e&e%e             Z, e+d' ejZ                  d()      *      Z.d+e._/         e,d, ejZ                  d-)      *      Z0d.e0_/         e)d/ ejZ                  d()      d0 ejb                         1      Z2d2e2_/         e*d3 ejZ                  d-)      d0 ejb                         1      Z3d4e3_/        y)5    N)	dataclass)AnyDictListOptionalTupleUnion)Tensor)load_state_dict_from_url)mu_law_decoding)	Tacotron2WaveRNN)
GriffinLimInverseMelScale   )utils)Tacotron2TTSBundlez.https://download.pytorch.org/torchaudio/modelsc                   X     e Zd Z fdZed        Zdeeee   f   de	e
e
f   fdZ xZS )_EnglishCharProcessorc                     t         |           t        j                         | _        t        | j                        D ci c]  \  }}||
 c}}| _        y c c}}w N)super__init__r   
_get_chars_tokens	enumerate_mapping)selfis	__class__s      iC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torchaudio/pipelines/_tts/impl.pyr   z_EnglishCharProcessor.__init__   sJ    '')*3DLL*AB*A$!QA*ABBs   Ac                     | j                   S r   r   r   s    r"   tokensz_EnglishCharProcessor.tokens       ||    textsreturnc           	          t        |t              r|g}|D cg c]<  }|j                         D cg c]   }|| j                  v s| j                  |   " c}> }}}t	        j
                  |      S c c}w c c}}w r   )
isinstancestrlowerr   r   
_to_tensor)r   r)   tcindicess        r"   __call__z_EnglishCharProcessor.__call__#   sm    eS!GEX]^X]STaggiNi1;MDMM!$iNX]^(( O^s   A7A2A2A72A7__name__
__module____qualname__r   propertyr&   r	   r-   r   r   r
   r3   __classcell__r!   s   @r"   r   r      sF    C
  )eCcN3 )ffn8M )r(   r   c                   ^     e Zd Zdd fd
Zed        Zdeeee   f   de	e
e
f   fdZ xZS )_EnglishPhoneProcessorN	dl_kwargsc                   t         |           t        j                         | _        t        | j                        D ci c]  \  }}||
 c}}| _        t        j                  d|      | _        d| _	        y c c}}w )Nzen_us_cmudict_forward.ptr=   z(\[[A-Z]+?\]|[_!'(),.:;? -]))
r   r   r   _get_phonesr   r   r   _load_phonemizer_phonemizer_pattern)r   r>   r   pr!   s       r"   r   z_EnglishPhoneProcessor.__init__+   sk    ((**3DLL*AB*A$!QA*AB 112LXab7 Cs   A;c                     | j                   S r   r$   r%   s    r"   r&   z_EnglishPhoneProcessor.tokens2   r'   r(   r)   r*   c           	      v   t        |t              r|g}g }| j                  |d      D ]p  }t        j                  | j
                  |      D cg c]  }t        j                  dd|       }}|j                  |D cg c]  }| j                  |    c}       r t        j                  |      S c c}w c c}w )Nen_us)langz[\[\]] )r,   r-   rB   refindallrC   subappendr   r   r/   )r   r)   r2   phonesrretrD   s          r"   r3   z_EnglishPhoneProcessor.__call__6   s    eS!GE&&u7&;F57ZZv5VW5V266)R+5VCWNNc:cDMM!,c:; < (( X:s   B1=B6
r4   r:   s   @r"   r<   r<   *   sH    $( 8  	)eCcN3 	)ffn8M 	)r(   r<   c                   H     e Zd Zddedee   f fdZed        ZddZ	 xZ
S )_WaveRNNVocodermodelmin_level_dbc                 L    t         |           d| _        || _        || _        y )N"V  )r   r   _sample_rate_model_min_level_db)r   rS   rT   r!   s      r"   r   z_WaveRNNVocoder.__init__H   s%    !)r(   c                     | j                   S r   rW   r%   s    r"   sample_ratez_WaveRNNVocoder.sample_rateN          r(   c                    t        j                  |      }dt        j                  t        j                  |d            z  }| j                  4| j                  |z
  | j                  z  }t        j                  |dd      }| j
                  j                  ||      \  }}t        j                  || j
                  j                        }t        || j
                  j                        }|j                  d      }||fS )N   gh㈵>)minr   r   )r`   max)torchexplog10clamprY   rX   inferr   _unnormalize_waveformn_bitsr   	n_classessqueeze)r   mel_speclengthswaveforms       r"   forwardz_WaveRNNVocoder.forwardR   s    99X&EKKd$CDD)**X59K9KKH{{8:H KK--h@'..x9K9KL"8T[[-B-BC##A&  r(   )ir   )r5   r6   r7   r   r   floatr   r8   r\   rn   r9   r:   s   @r"   rR   rR   G   s3    *g *Xe_ * ! !
!r(   rR   c                   6     e Zd Z fdZed        ZddZ xZS )_GriffinLimVocoderc           	          t         |           d| _        t        dd| j                  dddd      | _        t        dd	d
d      | _        y )NrV   i  P   g        g     @@slaney)n_stftn_melsr\   f_minf_max	mel_scalenormi   r      )n_fftpower
hop_length
win_length)r   r   rW   r   r\   _inv_melr   _griffin_lim)r   r!   s    r"   r   z_GriffinLimVocoder.__init__`   sX    !'!((
 '	
r(   c                     | j                   S r   r[   r%   s    r"   r\   z_GriffinLimVocoder.sample_rates   r]   r(   c                    t        j                  |      }|j                         j                         j	                  d      }| j                  |      }|j                         j	                  d      }| j                  |      }||fS )NTF)rb   rc   clonedetachrequires_grad_r   r   )r   rk   rl   spec	waveformss        r"   rn   z_GriffinLimVocoder.forwardw   so    99X&>>#**,;;DA}}X&{{}++E2%%d+	'!!r(   r   )r5   r6   r7   r   r8   r\   rn   r9   r:   s   @r"   rq   rq   _   s!    
& ! !"r(   rq   c                   ,    e Zd Zdej                  fdZy)
_CharMixinr*   c                     t               S r   )r   r%   s    r"   get_text_processorz_CharMixin.get_text_processor   s    $&&r(   Nr5   r6   r7   r   TextProcessorr    r(   r"   r   r      s    '$6$D$D 'r(   r   c                   2    e Zd Zdddej                  fdZy)_PhoneMixinNr=   r*   c                    t        |      S Nr=   )r<   )r   r>   s     r"   r   z_PhoneMixin.get_text_processor   s    %	::r(   r   r   r(   r"   r   r      s    .2 ;7I7W7W ;r(   r   c                   >    e Zd ZU eed<   eeef   ed<   dddefdZy)_Tacotron2Mixin_tacotron2_path_tacotron2_paramsNr=   r*   c                    t        di | j                  }t         d| j                   }|i n|}t	        |fi |}|j                  |       |j                          |S N/r   )r   r   	_BASE_URLr   r   load_state_dictevalr   r>   rS   url
state_dicts        r"   get_tacotron2z_Tacotron2Mixin.get_tacotron2   sc    3D2231T1123#+B	-c?Y?
j)

r(   )	r5   r6   r7   r-   __annotations__r   r   r   r   r   r(   r"   r   r      s%    CH~%)- ) r(   r   c                   P    e Zd ZU ee   ed<   eeeef      ed<   dddZdddZ	y)_WaveRNNMixin_wavernn_path_wavernn_paramsNr=   c                <    | j                  |      }t        |      S r   )_get_wavernnrR   )r   r>   wavernns      r"   get_vocoderz_WaveRNNMixin.get_vocoder   s     ##i#8w''r(   c                    t        di | j                  }t         d| j                   }|i n|}t	        |fi |}|j                  |       |j                          |S r   )r   r   r   r   r   r   r   r   s        r"   r   z_WaveRNNMixin._get_wavernn   sc    /$../1T//01#+B	-c?Y?
j)

r(   )
r5   r6   r7   r   r-   r   r   r   r   r   r   r(   r"   r   r      s1    C= d38n--'+ ( )- r(   r   c                       e Zd Zd Zy)_GriffinLimMixinc                     t               S r   )rq   )r   _s     r"   r   z_GriffinLimMixin.get_vocoder   s    !##r(   N)r5   r6   r7   r   r   r(   r"   r   r      s    $r(   r   c                       e Zd Zy)_Tacotron2WaveRNNCharBundleNr5   r6   r7   r   r(   r"   r   r          r(   r   c                       e Zd Zy)_Tacotron2WaveRNNPhoneBundleNr   r   r(   r"   r   r      r   r(   r   c                       e Zd Zy)_Tacotron2GriffinLimCharBundleNr   r   r(   r"   r   r      r   r(   r   c                       e Zd Zy)_Tacotron2GriffinLimPhoneBundleNr   r   r(   r"   r   r      r   r(   r   z5tacotron2_english_characters_1500_epochs_ljspeech.pth&   )	n_symbols)r   r   a  Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The default parameters were used.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
z3tacotron2_english_phonemes_1500_epochs_ljspeech.pth`   a  Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The text processor is set to the *"english_phonemes"*.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

z=tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pthz%wavernn_10k_epochs_8bits_ljspeech.pth)r   r   r   r   a  Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and :py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
z;tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.ptha  Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script for Tacotron2 `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script for WaveRNN `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>


Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
)4rJ   dataclassesr   typingr   r   r   r   r   r	   rb   r
   torchaudio._internalr   torchaudio.functionalr   torchaudio.modelsr   r   torchaudio.transformsr   r   rI   r   	interfacer   __all__r   r   r   r<   nnModuleVocoderrR   rq   r   r   r   r   r   r   r   r   r   _get_taco_params"TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH__doc__#TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_get_wrnn_paramsTACOTRON2_WAVERNN_CHAR_LJSPEECH TACOTRON2_WAVERNN_PHONE_LJSPEECHr   r(   r"   <module>r      sB   	 ! : :   9 1 0 =  )
<	).<< )")/== ):!ehhoo'9'A'A !0"*<*D*D "L' '
; ;
      $$ $ 	-*N` 	 	 	=/;Pb 	 	 	%5
Tf 	 	 	&6Vh 	 	 &DK,e,,r:& "!. " *F 'FI,e,,r:' #&/ # +P #>S,e,,r:9*E**,	# #+  'J $@Q,e,,r:9*E**,	$  ),   (r(   