
    Ph[                    V   d dl Z d dlZd dlmZmZmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZ g Z G d d	ej0                  j2                        Z G d
 dej0                  j2                        Z G d dej0                  j2                        Z G d dej0                  j2                        Z G d dej0                  j2                        Z G d dej0                  j2                        Z G d dej0                  j2                        Z  G d dej0                  j2                        Z! G d dej0                  j2                        Z" G d dej0                  j2                        Z# G d dej0                  j2                        Z$ G d dej0                  j2                        Z% G d  d!ej0                  j2                        Z& G d" d#ej0                  j2                        Z' G d$ d%ej0                  j2                        Z( G d& d'ej0                  j2                        Z) G d( d)e)      Z* G d* d+e)      Z+ G d, d-ej0                  j2                        Z, G d. d/ej0                  j2                        Z- G d0 d1ej0                  j2                        Z. G d2 d3ej0                  j2                        Z/ G d4 d5ej0                  j2                        Z0 G d6 d7ej0                  j2                        Z1 G d8 d9eej0                  j2                        Z2 G d: d;ej0                  j2                        Z3 G d< d=ej0                  j2                        Z4 G d> d?ej0                  j2                        Z5d@e6dAe7dBee6e6f   fdCZ8 G dD dEej0                  j2                        Z9 G dF dGej0                  j2                        Z: G dH dIej0                  j2                        Z; G dJ dKej0                  j2                        Z< G dL dMej0                  j2                        Z=y)N    N)CallableOptionalSequenceTupleUnion)Tensor)LazyModuleMixin)UninitializedParameter)
functional)_apply_sinc_resample_kernel_check_convolve_mode_fix_waveform_shape_get_sinc_resample_kernel_stretch_waveformc                        e Zd ZdZg dZddddej                  ddddd	ddfd
edee   dee   dede	de
f   dee   deeef   dee   dedededee   ddf fdZde
de
fdZ xZS )Spectrograma  Create a spectrogram from a audio signal.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        pad (int, optional): Two sided padding of signal. (Default: ``0``)
        window_fn (Callable[..., Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        power (float or None, optional): Exponent for the magnitude spectrogram,
            (must be > 0) e.g., 1 for magnitude, 2 for power, etc.
            If None, then the complex spectrum is returned instead. (Default: ``2``)
        normalized (bool or str, optional): Whether to normalize by magnitude after stft. If input is str, choices are
            ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to
            ``"window"``. (Default: ``False``)
        wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
        center (bool, optional): whether to pad :attr:`waveform` on both sides so
            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
            (Default: ``True``)
        pad_mode (string, optional): controls the padding method used when
            :attr:`center` is ``True``. (Default: ``"reflect"``)
        onesided (bool, optional): controls whether to return half of results to
            avoid redundancy (Default: ``True``)
        return_complex (bool, optional):
            Deprecated and not used.

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = torchaudio.transforms.Spectrogram(n_fft=800)
        >>> spectrogram = transform(waveform)

    n_fft
win_length
hop_lengthpadpower
normalized  Nr          @FTreflectr   r   r   r   	window_fn.r   r   wkwargscenterpad_modeonesidedreturn_complexreturnc                    t         t        |           t        j                  j                  d       || _        ||n|| _        ||n| j                  dz  | _        | || j                        n || j                  fi |}| j                  d|       || _
        || _        || _        |	| _        |
| _        || _        |t!        j"                  d       y y )Nz!torchaudio.transforms.Spectrogram   windowz`return_complex` argument is now deprecated and is not effective.`torchaudio.transforms.Spectrogram(power=None)` always returns a tensor with complex dtype. Please remove the argument in the function call.)superr   __init__torch_C_log_api_usage_oncer   r   r   register_bufferr   r   r   r   r    r!   warningswarn)selfr   r   r   r   r   r   r   r   r   r    r!   r"   r&   	__class__s                 lC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torchaudio/transforms/_transforms.pyr(   zSpectrogram.__init__?   s     	k4)+$$%HI
 )3(>*E(2(>*DOOWXDX/64??+IdooDiahDiXv.
$  %MMR &    waveformc                    t        j                  || j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                        S )a<  
        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Dimension (..., freq, time), where freq is
            ``n_fft // 2 + 1`` where ``n_fft`` is the number of
            Fourier bins, and time is the number of window hops (n_frame).
        )Fspectrogramr   r&   r   r   r   r   r   r   r    r!   r/   r3   s     r1   forwardzSpectrogram.forwardd   s^     }}HHKKJJOOOOJJOOKKMMMM
 	
r2   )__name__
__module____qualname____doc____constants__r)   hann_windowintr   r   r   floatr   boolstrdictr(   r8   __classcell__r0   s   @r1   r   r      s    #H XM $($(+0+<+<!$',"&!)-## SM# SM	#
 # CK(# # $)$# $# # # # !# 
#J
 
6 
r2   r   c                        e Zd ZdZg dZddddej                  dddddf
d	ed
ee   dee   dede	de
f   deeef   dee   dedededdf fdZdde
dee   de
fdZ xZS )InverseSpectrogramaJ  Create an inverse spectrogram to recover an audio signal from a spectrogram.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        pad (int, optional): Two sided padding of signal. (Default: ``0``)
        window_fn (Callable[..., Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        normalized (bool or str, optional): Whether the stft output was normalized by magnitude. If input is str,
            choices are ``"window"`` and ``"frame_length"``, dependent on normalization mode. ``True`` maps to
            ``"window"``. (Default: ``False``)
        wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
        center (bool, optional): whether the signal in spectrogram was padded on both sides so
            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
            (Default: ``True``)
        pad_mode (string, optional): controls the padding method used when
            :attr:`center` is ``True``. (Default: ``"reflect"``)
        onesided (bool, optional): controls whether spectrogram was used to return half of results to
            avoid redundancy (Default: ``True``)

    Example
        >>> batch, freq, time = 2, 257, 100
        >>> length = 25344
        >>> spectrogram = torch.randn(batch, freq, time, dtype=torch.cdouble)
        >>> transform = transforms.InverseSpectrogram(n_fft=512)
        >>> waveform = transform(spectrogram, length)
    r   r   Nr   FTr   r   r   r   r   r   .r   r   r   r    r!   r#   c                 6   t         t        |           || _        ||n|| _        ||n| j                  dz  | _        | || j                        n || j                  fi |}| j                  d|       || _        || _        || _	        |	| _
        |
| _        y Nr%   r&   )r'   rG   r(   r   r   r   r,   r   r   r   r    r!   )r/   r   r   r   r   r   r   r   r   r    r!   r&   r0   s               r1   r(   zInverseSpectrogram.__init__   s     	 $02
 )3(>*E(2(>*DOOWXDX/64??+IdooDiahDiXv.$  r2   r6   lengthc                     t        j                  ||| j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                        S )a.  
        Args:
            spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time).
            length (int or None, optional): The output length of the waveform.

        Returns:
            Tensor: Dimension (..., time), Least squares estimation of the original signal.
        )r5   inverse_spectrogramr   r&   r   r   r   r   r   r    r!   )r/   r6   rJ   s      r1   r8   zInverseSpectrogram.forward   s\     $$HHKKJJOOOOOOKKMMMM
 	
r2   N)r9   r:   r;   r<   r=   r)   r>   r?   r   r   r   r   rA   rB   rC   r(   r8   rD   rE   s   @r1   rG   rG   }   s    @ XM $($(+0+<+<',"&!!! SM! SM	!
 ! CK(! $)$! $! ! ! ! 
!6
6 
8C= 
F 
r2   rG   c                        e Zd ZdZg dZddddej                  dddddf
d	ed
edee   dee   de	de
f   dedee   dedee   deddf fdZde
de
fdZ xZS )
GriffinLimai  Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Implementation ported from
    *librosa* :cite:`brian_mcfee-proc-scipy-2015`, *A fast Griffin-Lim algorithm* :cite:`6701851`
    and *Signal estimation from modified short-time Fourier transform* :cite:`1172092`.

    Args:
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
        n_iter (int, optional): Number of iteration for phase recovery process. (Default: ``32``)
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        window_fn (Callable[..., Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        power (float, optional): Exponent for the magnitude spectrogram,
            (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
        wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)
        momentum (float, optional): The momentum parameter for fast Griffin-Lim.
            Setting this to 0 recovers the original Griffin-Lim method.
            Values near 1 can lead to faster convergence, but above 1 may not converge. (Default: ``0.99``)
        length (int, optional): Array length of the expected output. (Default: ``None``)
        rand_init (bool, optional): Initializes phase randomly if True and to zero otherwise. (Default: ``True``)

    Example
        >>> batch, freq, time = 2, 257, 100
        >>> spectrogram = torch.randn(batch, freq, time)
        >>> transform = transforms.GriffinLim(n_fft=512)
        >>> waveform = transform(spectrogram)
    )r   n_iterr   r   r   rJ   momentum	rand_initr       Nr   Gz?Tr   rP   r   r   r   .r   r   rQ   rJ   rR   r#   c                    t         t        |           d|cxk  rdk  sn t        dj	                  |            || _        || _        ||n|| _        ||n| j                  dz  | _        | || j                        n || j                  fi |}| j                  d|       |	| _
        || _        || _        |
| _        y )Nr      z/momentum must be in the range [0, 1). Found: {}r%   r&   )r'   rO   r(   
ValueErrorformatr   rP   r   r   r,   rJ   r   rQ   rR   )r/   r   rP   r   r   r   r   r   rQ   rJ   rR   r&   r0   s               r1   r(   zGriffinLim.__init__   s     	j$(*X!!NUUV^_``
(2(>*E(2(>*DOOWXDX/64??+IdooDiahDiXv.
 "r2   specgramc                     t        j                  || j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  
      S )a/  
        Args:
            specgram (Tensor):
                A magnitude-only STFT spectrogram of dimension (..., freq, frames)
                where freq is ``n_fft // 2 + 1``.

        Returns:
            Tensor: waveform of (..., time), where time equals the ``length`` parameter if given.
        )r5   
griffinlimr&   r   r   r   r   rP   rQ   rJ   rR   r/   rY   s     r1   r8   zGriffinLim.forward  sW     ||KKJJOOOOJJKKMMKKNN
 	
r2   )r9   r:   r;   r<   r=   r)   r>   r?   r   r   r   r@   rC   rA   r(   r8   rD   rE   s   @r1   rO   rO      s    @ pM $($(+0+<+<"& $## # SM	#
 SM# CK(# # $# # # # 
#:
 
6 
r2   rO   c                   P     e Zd ZdZg dZd
dedee   ddf fdZde	de	fd	Z
 xZS )AmplitudeToDBaW  Turn a tensor from the power/amplitude scale to the decibel scale.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    This output depends on the maximum value in the input tensor, and so
    may return different values for an audio clip split into snippets vs. a
    a full clip.

    Args:
        stype (str, optional): scale of input tensor (``"power"`` or ``"magnitude"``). The
            power being the elementwise square of the magnitude. (Default: ``"power"``)
        top_db (float or None, optional): minimum negative cut-off in decibels.  A reasonable
            number is 80. (Default: ``None``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.AmplitudeToDB(stype="amplitude", top_db=80)
        >>> waveform_db = transform(waveform)
    )
multiplieramin	ref_valuedb_multiplierNstypetop_dbr#   c                    t         t        |           || _        ||dk  rt	        d      || _        |dk(  rdnd| _        d| _        d| _        t        j                  t        | j                  | j                              | _        y )Nr   ztop_db must be positive valuer   g      $@      4@g|=      ?)r'   r^   r(   rc   rW   rd   r_   r`   ra   mathlog10maxrb   )r/   rc   rd   r0   s      r1   r(   zAmplitudeToDB.__init__C  st    mT+-
&1*<=="'7"2$	!ZZDIIt~~(FGr2   xc                     t        j                  || j                  | j                  | j                  | j
                        S )a*  Numerically stable implementation from Librosa.

        https://librosa.org/doc/latest/generated/librosa.amplitude_to_db.html

        Args:
            x (Tensor): Input tensor before being converted to decibel scale.

        Returns:
            Tensor: Output tensor in decibel scale.
        )r5   amplitude_to_DBr_   r`   rb   rd   r/   rk   s     r1   r8   zAmplitudeToDB.forwardN  s2       DOOTYY@R@RTXT_T_``r2   )r   N)r9   r:   r;   r<   r=   rB   r   r@   r(   r   r8   rD   rE   s   @r1   r^   r^   +  sI    * IM	Hc 	HXe_ 	HPT 	Ha aF ar2   r^   c                   x     e Zd ZdZg dZ	 	 	 	 	 	 	 ddedededee   ded	ee   d
eddf fdZ	de
de
fdZ xZS )MelScaleaf  Turn a normal STFT into a mel frequency STFT with triangular filter banks.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
        norm (str or None, optional): If ``"slaney"``, divide the triangular mel weights by the width of the mel band
            (area normalization). (Default: ``None``)
        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
        >>> spectrogram = spectrogram_transform(waveform)
        >>> melscale_transform = transforms.MelScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
        >>> melscale_spectrogram = melscale_transform(spectrogram)

    See also:
        :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
        generate the filter banks.
    )n_melssample_ratef_minf_maxNrq   rr   rs   rt   n_stftnorm	mel_scaler#   c           	         t         t        |           || _        || _        ||nt        |dz        | _        || _        || _        || _	        || j                  kD  r%t        dj                  || j                              t        j                  || j                  | j                  | j                  | j                  | j                  | j                        }| j                  d|       y )Nr%   Require f_min: {} <= f_max: {}fb)r'   rp   r(   rq   rr   r@   rt   rs   rv   rw   rW   rX   r5   melscale_fbanksr,   )
r/   rq   rr   rs   rt   ru   rv   rw   rz   r0   s
            r1   r(   zMelScale.__init__z  s     	h&(&#/UU;!;K5L

	"4::=DDUDJJWXXvtzz4::t{{DL\L\^b^g^gimiwiwxT2&r2   rY   c                     t        j                  |j                  dd      | j                        j                  dd      }|S )z
        Args:
            specgram (Tensor): A spectrogram STFT of dimension (..., freq, time).

        Returns:
            Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
        )r)   matmul	transposerz   )r/   rY   mel_specgrams      r1   r8   zMelScale.forward  s:     ||H$6$6r2$>HRRSUWYZr2   )   >          N   Nhtkr9   r:   r;   r<   r=   r?   r@   r   rB   r(   r   r8   rD   rE   s   @r1   rp   rp   \  s    6 @M  !%"'' ' 	'
 ' ' sm' ' 
'0 6 r2   rp   c                   |     e Zd ZdZg dZ	 	 	 	 	 	 	 ddededededee   d	ee   d
ededdf fdZ	de
de
fdZ xZS )InverseMelScalea  Estimate a STFT in normal frequency domain from mel frequency domain.

    .. devices:: CPU CUDA

    It minimizes the euclidian norm between the input mel-spectrogram and the product between
    the estimated spectrogram and the filter banks using `torch.linalg.lstsq`.

    Args:
        n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
        n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band
            (area normalization). (Default: ``None``)
        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
        driver (str, optional): Name of the LAPACK/MAGMA method to be used for `torch.lstsq`.
            For CPU inputs the valid values are ``"gels"``, ``"gelsy"``, ``"gelsd"``, ``"gelss"``.
            For CUDA input, the only valid driver is ``"gels"``, which assumes that A is full-rank.
            (Default: ``"gels``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> mel_spectrogram_transform = transforms.MelSpectrogram(sample_rate, n_fft=1024)
        >>> mel_spectrogram = mel_spectrogram_transform(waveform)
        >>> inverse_melscale_transform = transforms.InverseMelScale(n_stft=1024 // 2 + 1)
        >>> spectrogram = inverse_melscale_transform(mel_spectrogram)
    )ru   rq   rr   rs   rt   Nru   rq   rr   rs   rt   rv   rw   driverr#   c	           	         t         t        |           || _        || _        |xs t        |dz        | _        || _        || _        || j                  kD  r%t        dj                  || j                              |dvrt        d| d      t        j                  || j                  | j                  | j                  | j                  ||      }	| j                  d|	       y )Nr%   ry   )gelsgelsygelsdgelsszAdriver must be one of ["gels", "gelsy", "gelsd", "gelss"]. Found .rz   )r'   r   r(   rq   rr   r@   rt   rs   r   rW   rX   r5   r{   r,   )r/   ru   rq   rr   rs   rt   rv   rw   r   rz   r0   s             r1   r(   zInverseMelScale.__init__  s     	ot-/&5eK1$45

4::=DDUDJJWXX<<`ag`hhijkkvtzz4::t{{DL\L\^bdmnT2&r2   melspecc                    |j                         }|j                  d|d   |d         }|d   |d   }}| j                  j                         \  }}| j                  |k7  r%t	        dj                  | j                  |            t        j                  t        j                  j                  | j                  j                  dd      d   || j                        j                        }|j                  |dd ||fz         }|S )z
        Args:
            melspec (Tensor): A Mel frequency spectrogram of dimension (..., ``n_mels``, time)

        Returns:
            Tensor: Linear scale spectrogram of size (..., freq, time)
        r}   r~   z-Expected an input with {} mel bins. Found: {}N)r   )sizeviewrz   rq   rW   rX   r)   relulinalglstsqr   r   solution)r/   r   shaperq   timefreq_rY   s           r1   r8   zInverseMelScale.forward  s     ,,r59eBi8Ry%)'',,.a;;& LSSTXT_T_aghii::ell001B1B2r1J41PRYbfbmbm0nwwx ==stTl!:;r2   )r   r   r   NNr   r   r   rE   s   @r1   r   r     s    8M  !%"'' ' 	'
 ' ' sm' ' ' 
'6v & r2   r   c            %            e Zd ZdZg dZddddddddej                  d	d
ddddddfdededee   dee   de	dee	   dedede
def   de	dedee   dededee   dee   ded df$ fd!Zd"ed efd#Z xZS )$MelSpectrograma
  Create MelSpectrogram for a raw audio signal.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    This is a composition of :py:func:`torchaudio.transforms.Spectrogram`
    and :py:func:`torchaudio.transforms.MelScale`.

    Sources
        * https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe
        * https://timsainb.github.io/spectrograms-mfccs-and-inversion-in-python.html
        * http://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html

    Args:
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``None``)
        pad (int, optional): Two sided padding of signal. (Default: ``0``)
        n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
        window_fn (Callable[..., Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        power (float, optional): Exponent for the magnitude spectrogram,
            (must be > 0) e.g., 1 for magnitude, 2 for power, etc. (Default: ``2``)
        normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
        wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
        center (bool, optional): whether to pad :attr:`waveform` on both sides so
            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
            (Default: ``True``)
        pad_mode (string, optional): controls the padding method used when
            :attr:`center` is ``True``. (Default: ``"reflect"``)
        onesided: Deprecated and unused.
        norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band
            (area normalization). (Default: ``None``)
        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.MelSpectrogram(sample_rate)
        >>> mel_specgram = transform(waveform)  # (channel, n_mels, time)

    See also:
        :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
        generate the filter banks.
    )rr   r   r   r   r   rq   rs   r   r   Nr   r   r   r   FTr   r   rr   r   r   r   rs   rt   r   rq   r   .r   r   r   r   r    r!   rv   rw   r#   c                    t         t        |           t        j                  j                  d       |t        j                  d       || _        || _	        ||n|| _
        ||n| j                  dz  | _        || _        |
| _        || _        || _        || _        || _        t%        | j                  | j                  | j                  | j                  |	| j                  | j                  |||d      | _        t)        | j                  | j                  | j"                  | j                   | j                  dz  dz   ||      | _        y )Nz$torchaudio.transforms.MelSpectrogramz\Argument 'onesided' has been deprecated and has no influence on the behavior of this module.r%   T)r   r   r   r   r   r   r   r   r   r    r!   rV   )r'   r   r(   r)   r*   r+   r-   r.   rr   r   r   r   r   r   r   rq   rt   rs   r   r6   rp   rw   )r/   rr   r   r   r   rs   rt   r   rq   r   r   r   r   r   r    r!   rv   rw   r0   s                     r1   r(   zMelSpectrogram.__init__,  s!   ( 	nd,.$$%KLMMn '
(2(>*E(2(>*DOOWXDX
$

&****
 "KK))4::tzz4::QR?UVCVX\^g
r2   r3   c                 J    | j                  |      }| j                  |      }|S )z
        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
        )r6   rw   )r/   r3   rY   r   s       r1   r8   zMelSpectrogram.forwardc  s(     ##H-~~h/r2   )r9   r:   r;   r<   r=   r)   r>   r?   r   r@   r   r   rA   rC   rB   r(   r8   rD   rE   s   @r1   r   r     s8   /` cM !$($(!%+0+<+< "&!#'"%5
5
 5
 SM	5

 SM5
 5
 5
 5
 5
 CK(5
 5
 5
 $5
 5
 5
  4.!5
" sm#5
$ %5
& 
'5
n
 
6 
r2   r   c                   l     e Zd ZdZg dZ	 	 	 	 	 	 ddededededed	ee	   d
df fdZ
ded
efdZ xZS )MFCCa  Create the Mel-frequency cepstrum coefficients from an audio signal.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    By default, this calculates the MFCC on the DB-scaled Mel spectrogram.
    This is not the textbook implementation, but is implemented here to
    give consistency with librosa.

    This output depends on the maximum value in the input spectrogram, and so
    may return different values for an audio clip split into snippets vs. a
    a full clip.

    Args:
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        n_mfcc (int, optional): Number of mfc coefficients to retain. (Default: ``40``)
        dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``)
        norm (str, optional): norm to use. (Default: ``"ortho"``)
        log_mels (bool, optional): whether to use log-mel spectrograms instead of db-scaled. (Default: ``False``)
        melkwargs (dict or None, optional): arguments for MelSpectrogram. (Default: ``None``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.MFCC(
        >>>     sample_rate=sample_rate,
        >>>     n_mfcc=13,
        >>>     melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23, "center": False},
        >>> )
        >>> mfcc = transform(waveform)

    See also:
        :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
        generate the filter banks.
    )rr   n_mfccdct_typerd   log_melsNrr   r   r   rv   r   	melkwargsr#   c                 8   t         t        |           dg}||vrt        dj	                  |            || _        || _        || _        || _        d| _	        t        d| j                        | _        |xs i }t        dd| j
                  i|| _        | j                  | j                  j                  kD  rt        d      t        j                  | j                  | j                  j                  | j                        }| j!                  d|       || _        y )	Nr%   DCT type not supported: {}      T@r   rr   z4Cannot select more MFCC coefficients than # mel binsdct_mat )r'   r   r(   rW   rX   rr   r   r   rv   rd   r^   rm   r   rq   r5   
create_dctr,   r   )
r/   rr   r   r   rv   r   r   supported_dct_typesr   r0   s
            r1   r(   zMFCC.__init__  s     	dD"$ c..9@@JKK& 	,WdkkBO	,W9I9IWYW;;,,333STT,,t{{D,?,?,F,F		RY0 r2   r3   c                    | j                  |      }| j                  rd}t        j                  ||z         }n| j	                  |      }t        j
                  |j                  dd      | j                        j                  dd      }|S )z
        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: specgram_mel_db of size (..., ``n_mfcc``, time).
        ư>r}   r~   )r   r   r)   logrm   r   r   r   )r/   r3   r   
log_offsetmfccs        r1   r8   zMFCC.forward  sz     **84==J 99\J%>?L//=L ||L222r:DLLISSTVXZ[r2   )r   (   r%   orthoFN)r9   r:   r;   r<   r=   r?   rB   rA   r   rC   r(   r   r8   rD   rE   s   @r1   r   r   p  s    "F PM !$(!! ! 	!
 ! ! D>! 
!: 6 r2   r   c                        e Zd ZdZg dZ	 	 	 	 	 	 	 	 	 ddedededee   ded	ed
ede	dee
   ddf fdZdedefdZ xZS )LFCCa  Create the linear-frequency cepstrum coefficients from an audio signal.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    By default, this calculates the LFCC on the DB-scaled linear filtered spectrogram.
    This is not the textbook implementation, but is implemented here to
    give consistency with librosa.

    This output depends on the maximum value in the input spectrogram, and so
    may return different values for an audio clip split into snippets vs. a
    a full clip.

    Args:
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        n_filter (int, optional): Number of linear filters to apply. (Default: ``128``)
        n_lfcc (int, optional): Number of lfc coefficients to retain. (Default: ``40``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``None``)
        dct_type (int, optional): type of DCT (discrete cosine transform) to use. (Default: ``2``)
        norm (str, optional): norm to use. (Default: ``"ortho"``)
        log_lf (bool, optional): whether to use log-lf spectrograms instead of db-scaled. (Default: ``False``)
        speckwargs (dict or None, optional): arguments for Spectrogram. (Default: ``None``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.LFCC(
        >>>     sample_rate=sample_rate,
        >>>     n_lfcc=13,
        >>>     speckwargs={"n_fft": 400, "hop_length": 160, "center": False},
        >>> )
        >>> lfcc = transform(waveform)

    See also:
        :py:func:`torchaudio.functional.linear_fbanks` - The function used to
        generate the filter banks.
    )rr   n_filtern_lfccr   rd   log_lfNrr   r   rs   rt   r   r   rv   r   
speckwargsr#   c
                 2   t         t        |           dg}
||
vrt        dj	                  |            || _        || _        ||nt        |dz        | _        || _	        || _
        || _        || _        d| _        t        d| j                        | _        |	xs i }	t!        d
i |	| _        | j                  | j                   j"                  kD  rt        d      t%        j&                  | j                   j"                  dz  dz   | j                  | j                  | j                  | j
                        }| j)                  d|       t%        j*                  | j                  | j                  | j                        }| j)                  d	|       || _        y )Nr%   r   r   r   z4Cannot select more LFCC coefficients than # fft binsrV   )n_freqsrs   rt   r   rr   
filter_matr   r   )r'   r   r(   rW   rX   rr   rs   r@   rt   r   r   r   rv   rd   r^   rm   r   r   r5   linear_fbanksr,   r   r   )r/   rr   r   rs   rt   r   r   rv   r   r   r   r   r   r0   s                r1   r(   zLFCC.__init__  sa    	dD"$ c..9@@JKK&
#/UU;!;K5L
  	,WdkkB%2
&44;;))///STT__$$**a/!3****]]((

 	\:6,,t{{DMM499EY0r2   r3   c                    | j                  |      }t        j                  |j                  dd      | j                        j                  dd      }| j
                  rd}t        j                  ||z         }n| j                  |      }t        j                  |j                  dd      | j                        j                  dd      }|S )z
        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Linear Frequency Cepstral Coefficients of size (..., ``n_lfcc``, time).
        r}   r~   r   )	r   r)   r   r   r   r   r   rm   r   )r/   r3   rY   r   lfccs        r1   r8   zLFCC.forward  s     ##H- << 2 22r :DOOLVVWY[]^;;JyyJ!67H++H5H ||H..r26EOOPRTVWr2   )	r   r   r   Nr   r%   r   FN)r9   r:   r;   r<   r=   r?   r@   r   rB   rA   rC   r(   r   r8   rD   rE   s   @r1   r   r     s    %L ZM !!%%)++ + 	+
 + + + + + TN+ 
+Z 6 r2   r   c                   D     e Zd ZdZdgZddeddf fdZdedefdZ xZ	S )	MuLawEncodinga  Encode signal based on mu-law companding.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    For more info see the
    `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_

    This algorithm assumes the signal has been scaled to between -1 and 1 and
    returns a signal encoded with values from 0 to quantization_channels - 1

    Args:
        quantization_channels (int, optional): Number of channels. (Default: ``256``)

    Example
       >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
       >>> transform = torchaudio.transforms.MuLawEncoding(quantization_channels=512)
       >>> mulawtrans = transform(waveform)

    quantization_channelsr#   Nc                 8    t         t        |           || _        y rM   )r'   r   r(   r   r/   r   r0   s     r1   r(   zMuLawEncoding.__init__M      mT+-%:"r2   rk   c                 B    t        j                  || j                        S )z
        Args:
            x (Tensor): A signal to be encoded.

        Returns:
            Tensor: An encoded signal.
        )r5   mu_law_encodingr   rn   s     r1   r8   zMuLawEncoding.forwardQ  s       D$>$>??r2      
r9   r:   r;   r<   r=   r?   r(   r   r8   rD   rE   s   @r1   r   r   5  s;    * --M;c ;D ;@ @F @r2   r   c                   D     e Zd ZdZdgZddeddf fdZdedefdZ xZ	S )	MuLawDecodinga  Decode mu-law encoded signal.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    For more info see the
    `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_

    This expects an input with values between 0 and ``quantization_channels - 1``
    and returns a signal scaled between -1 and 1.

    Args:
        quantization_channels (int, optional): Number of channels. (Default: ``256``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = torchaudio.transforms.MuLawDecoding(quantization_channels=512)
        >>> mulawtrans = transform(waveform)
    r   r#   Nc                 8    t         t        |           || _        y rM   )r'   r   r(   r   r   s     r1   r(   zMuLawDecoding.__init__s  r   r2   x_muc                 B    t        j                  || j                        S )z
        Args:
            x_mu (Tensor): A mu-law encoded signal which needs to be decoded.

        Returns:
            Tensor: The signal decoded.
        )r5   mu_law_decodingr   )r/   r   s     r1   r8   zMuLawDecoding.forwardw  s       t'A'ABBr2   r   r   rE   s   @r1   r   r   \  s;    ( --M;c ;D ;CF Cv Cr2   r   c                        e Zd ZdZ	 	 	 	 	 	 ddddededededed	ee   d
eej                     ddf fdZ
dedefdZ xZS )Resamplea  Resample a signal from one frequency to another. A resampling method can be given.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Note:
        If resampling on waveforms of higher precision than float32, there may be a small loss of precision
        because the kernel is cached once as float32. If high precision resampling is important for your application,
        the functional form will retain higher precision, but run slower because it does not cache the kernel.
        Alternatively, you could rewrite a transform that caches a higher precision kernel.

    Args:
        orig_freq (int, optional): The original frequency of the signal. (Default: ``16000``)
        new_freq (int, optional): The desired frequency. (Default: ``16000``)
        resampling_method (str, optional): The resampling method to use.
            Options: [``sinc_interp_hann``, ``sinc_interp_kaiser``] (Default: ``"sinc_interp_hann"``)
        lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper
            but less efficient. (Default: ``6``)
        rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist.
            Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``)
        beta (float or None, optional): The shape parameter used for kaiser window.
        dtype (torch.device, optional):
            Determnines the precision that resampling kernel is pre-computed and cached. If not provided,
            kernel is computed with ``torch.float64`` then cached as ``torch.float32``.
            If you need higher precision, provide ``torch.float64``, and the pre-computed kernel is computed and
            cached as ``torch.float64``. If you use resample with lower precision, then instead of providing this
            providing this argument, please use ``Resample.to(dtype)``, so that the kernel generation is still
            carried out on ``torch.float64``.

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.Resample(sample_rate, sample_rate/10)
        >>> waveform = transform(waveform)
    Ndtype	orig_freqnew_freqresampling_methodlowpass_filter_widthrolloffbetar   r#   c          
         t         	|           || _        || _        t	        j
                  t        | j                        t        | j                              | _        || _        || _        || _	        || _
        | j                  | j                  k7  rjt        | j                  | j                  | j
                  | j                  | j                  | j                  ||      \  }| _        | j                  d|       y y )Nr   kernel)r'   r(   r   r   rh   gcdr?   r   r   r   r   r   widthr,   )
r/   r   r   r   r   r   r   r   r   r0   s
            r1   r(   zResample.__init__  s     	" 88C/T]]1CD!2$8!	>>T]]*!:))&&	"FDJ   62 +r2   r3   c                     | j                   | j                  k(  r|S t        || j                   | j                  | j                  | j                  | j
                        S )z
        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Output signal of dimension (..., time).
        )r   r   r   r   r   r   r7   s     r1   r8   zResample.forward  sK     >>T]]*O*8T^^T]]TXT\T\^b^i^ikokukuvvr2   )r   r   sinc_interp_hann   rT   N)r9   r:   r;   r<   r?   rB   r@   r   r)   r   r(   r   r8   rD   rE   s   @r1   r   r     s    "L !3$% $ 3 (, 3 3  3 	 3
 " 3  3 uo 3 $ 3 
 3D
w 
w6 
wr2   r   c                   H     e Zd ZdZdgZd	dededdf fdZdedefdZ	 xZ
S )
ComputeDeltasa  Compute delta coefficients of a tensor, usually a spectrogram.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    See `torchaudio.functional.compute_deltas` for more details.

    Args:
        win_length (int, optional): The window length used for computing delta. (Default: ``5``)
        mode (str, optional): Mode parameter passed to padding. (Default: ``"replicate"``)
    r   moder#   Nc                 F    t         t        |           || _        || _        y rM   )r'   r   r(   r   r   )r/   r   r   r0   s      r1   r(   zComputeDeltas.__init__  s    mT+-$	r2   rY   c                 Z    t        j                  || j                  | j                        S )z
        Args:
            specgram (Tensor): Tensor of audio of dimension (..., freq, time).

        Returns:
            Tensor: Tensor of deltas of dimension (..., freq, time).
        )r   r   )r5   compute_deltasr   r   r\   s     r1   r8   zComputeDeltas.forward  s!     T__499UUr2   )   	replicate)r9   r:   r;   r<   r=   r?   rB   r(   r   r8   rD   rE   s   @r1   r   r     sA     "NM3 #  
V V6 Vr2   r   c            	       d     e Zd ZdZdgZddee   dedee   ddf fdZdde	d	ee   de	fd
Z
 xZS )TimeStretcha  Stretch stft in time without modifying pitch for a given rate.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Proposed in *SpecAugment* :cite:`specaugment`.

    Args:
        hop_length (int or None, optional): Length of hop between STFT windows.
            (Default: ``n_fft // 2``, where ``n_fft == (n_freq - 1) * 2``)
        n_freq (int, optional): number of filter banks from stft. (Default: ``201``)
        fixed_rate (float or None, optional): rate to speed up or slow down by.
            If None is provided, rate must be passed to the forward method. (Default: ``None``)

    .. note::

       The expected input is raw, complex-valued spectrogram.

    Example
        >>> spectrogram = torchaudio.transforms.Spectrogram(power=None)
        >>> stretch = torchaudio.transforms.TimeStretch()
        >>>
        >>> original = spectrogram(waveform)
        >>> stretched_1_2 = stretch(original, 1.2)
        >>> stretched_0_9 = stretch(original, 0.9)

        .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch.png
           :width: 600
           :alt: The visualization of stretched spectrograms.
    
fixed_rateNr   n_freqr#   c                     t         t        |           || _        |dz
  dz  }||n|dz  }| j	                  dt        j                  dt        j                  |z  |      d          y )NrV   r%   phase_advancer   ).N)	r'   r   r(   r   r,   r)   linspacerh   pi)r/   r   r   r   r   r0   s        r1   r(   zTimeStretch.__init__  sb    k4)+$!q #-#9Zuz
_ennQ*@TV\.]^g.hir2   complex_specgramsoverriding_ratec                     t        j                  |      st        j                  dd       |$| j                  t        d      | j                  }n|}t        j                  ||| j                        S )a2  
        Args:
            complex_specgrams (Tensor):
                A tensor of dimension `(..., freq, num_frame)` with complex dtype.
            overriding_rate (float or None, optional): speed up to apply to this batch.
                If no rate is passed, use ``self.fixed_rate``. (Default: ``None``)

        Returns:
            Tensor:
                Stretched spectrogram. The resulting tensor is of the corresponding complex dtype
                as the input spectrogram, and the number of frames is changed to ``ceil(num_frame / rate)``.
        zeThe input to TimeStretch must be complex type. Providing non-complex tensor produces invalid results.   )
stacklevelzLIf no fixed_rate is specified, must pass a valid rate to the forward method.)	r)   
is_complexr-   r.   r   rW   r5   phase_vocoderr   )r/   r   r   rates       r1   r8   zTimeStretch.forward   sn      12MMI "& !opp??D"D0$8J8JKKr2   )Nr   NrM   )r9   r:   r;   r<   r=   r   r?   r@   r(   r   r8   rD   rE   s   @r1   r   r     sf    > "NMj8C= j jX`afXg jsw jL L(5/ L]c Lr2   r   c            	            e Zd ZdZddedededdf fdZdedefd	Zd
ede	j                  defdZd
ede	j                  defdZ xZS )Fadea  Add a fade in and/or fade out to an waveform.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        fade_in_len (int, optional): Length of fade-in (time frames). (Default: ``0``)
        fade_out_len (int, optional): Length of fade-out (time frames). (Default: ``0``)
        fade_shape (str, optional): Shape of fade. Must be one of: "quarter_sine",
            ``"half_sine"``, ``"linear"``, ``"logarithmic"``, ``"exponential"``.
            (Default: ``"linear"``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.Fade(fade_in_len=sample_rate, fade_out_len=2 * sample_rate, fade_shape="linear")
        >>> faded_waveform = transform(waveform)
    fade_in_lenfade_out_len
fade_shaper#   Nc                 T    t         t        |           || _        || _        || _        y rM   )r'   r   r(   r   r   r   )r/   r   r   r   r0   s       r1   r(   zFade.__init__Q  s'    dD"$&($r2   r3   c                     |j                         d   }|j                  }| j                  ||      | j                  ||      z  |z  S )
        Args:
            waveform (Tensor): Tensor of audio of dimension `(..., time)`.

        Returns:
            Tensor: Tensor of audio of dimension `(..., time)`.
        r}   )r   device_fade_in	_fade_out)r/   r3   waveform_lengthr  s       r1   r8   zFade.forwardW  sE     #--/"-}}_f5X^8__bjjjr2   r  r  c                    t        j                  dd| j                  |      }t        j                  || j                  z
  |      }| j                  dk(  r|}| j                  dk(  rt        j
                  d|dz
        |z  }| j                  dk(  rt        j                  d|z         dz   }| j                  d	k(  r)t        j                  |t        j                  z  dz        }| j                  d
k(  r@t        j                  |t        j                  z  t        j                  dz  z
        dz  dz   }t        j                  ||f      j                  dd      S )Nr   rV   r  linearexponentialr%   logarithmic皙?quarter_sine	half_sine      ?)r)   r   r   onesr   powri   sinrh   r   catclamp_r/   r  r  fader  s        r1   r  zFade._fade_inc  s   ~~aD$4$4VDzz/D,<,<<VL??h&D??m+99Q+d2D??m+;;sTz*Q.D??n,99TDGG^a/0D??k)99TDGG^dggk9:Q>DDyy$&--a33r2   c                    t        j                  dd| j                  |      }t        j                  || j                  z
  |      }| j                  dk(  r| dz   }| j                  dk(  rt        j
                  d|       d|z
  z  }| j                  dk(  rt        j                  d|z
        dz   }| j                  d	k(  r=t        j                  |t        j                  z  dz  t        j                  dz  z         }| j                  d
k(  r@t        j                  |t        j                  z  t        j                  dz  z         dz  dz   }t        j                  ||f      j                  dd      S )Nr   rV   r  r	  r
  r%   r  g?r  r  r  )r)   r   r   r  r   r  ri   r  rh   r   r  r  r  s        r1   r  zFade._fade_outx  s%   ~~aD$5$5fEzz/D,=,==fM??h&519D??m+99Q&!d(3D??m+;;sTz*Q.D??n,99TDGG^a/$''A+=>D??k)99TDGG^dggk9:Q>DDyy$&--a33r2   )r   r   r	  )r9   r:   r;   r<   r?   rB   r(   r   r8   r)   r  r  r  rD   rE   s   @r1   r   r   =  s}    &%C %3 %PS %cg %
k 
k6 
k4 4U\\ 4f 4*4 4ell 4v 4r2   r   c                   X     e Zd ZdZg dZddededededdf
 fd	Zdd
e	dede	fdZ
 xZS )_AxisMaskinga/  Apply masking to a spectrogram.

    Args:
        mask_param (int): Maximum possible length of the mask.
        axis (int): What dimension the mask is applied on (assuming the tensor is 3D).
            For frequency masking, axis = 1.
            For time masking, axis = 2.
        iid_masks (bool): Applies iid masks to each of the examples in the batch dimension.
            This option is applicable only when the dimension of the input tensor is >= 3.
        p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0)
    )
mask_paramaxis	iid_maskspr  r  r  r  r#   Nc                 b    t         t        |           || _        || _        || _        || _        y rM   )r'   r  r(   r  r  r  r  )r/   r  r  r  r  r0   s        r1   r(   z_AxisMasking.__init__  s,    lD*,$	"r2   rY   
mask_valuec                 J   | j                   rLt        j                  || j                  || j                  |j                         z   dz
  | j                        S t        j                  || j                  || j                  |j                         z   dz
  | j                        S )a  
        Args:
            specgram (Tensor): Tensor of dimension `(..., freq, time)`.
            mask_value (float): Value to assign to the masked columns.

        Returns:
            Tensor: Masked spectrogram of dimensions `(..., freq, time)`.
           r  )r  r5   mask_along_axis_iidr  r  dimr  mask_along_axis)r/   rY   r  s      r1   r8   z_AxisMasking.forward  s     >>(($//:tyy8<<>7QTU7UY]Y_Y_  $$Xt
DIIX`XdXdXfLfijLjnrntntuur2   )rg   )r   r9   r:   r;   r<   r=   r?   rA   r@   r(   r   r8   rD   rE   s   @r1   r  r    sV    
 =M3 c d u W[ v vE vF vr2   r  c                   2     e Zd ZdZddededdf fdZ xZS )FrequencyMaskingaH  Apply masking to a spectrogram in the frequency domain.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Proposed in *SpecAugment* :cite:`specaugment`.

    Args:
        freq_mask_param (int): maximum possible length of the mask.
            Indices uniformly sampled from [0, freq_mask_param).
        iid_masks (bool, optional): whether to apply different masks to each
            example/channel in the batch. (Default: ``False``)
            This option is applicable only when the input tensor >= 3D.

    Example
        >>> spectrogram = torchaudio.transforms.Spectrogram()
        >>> masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
        >>>
        >>> original = spectrogram(waveform)
        >>> masked = masking(original)

        .. image::  https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking1.png
           :alt: The original spectrogram

        .. image::  https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking2.png
           :alt: The spectrogram masked along frequency axis
    freq_mask_paramr  r#   Nc                 0    t         t        |   |d|       y )NrV   )r'   r(  r(   )r/   r)  r  r0   s      r1   r(   zFrequencyMasking.__init__  s    .9Mr2   )F)r9   r:   r;   r<   r?   rA   r(   rD   rE   s   @r1   r(  r(    s,    :N N N N Nr2   r(  c            	       6     e Zd ZdZddedededdf fdZ xZS )	TimeMaskinga  Apply masking to a spectrogram in the time domain.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Proposed in *SpecAugment* :cite:`specaugment`.

    Args:
        time_mask_param (int): maximum possible length of the mask.
            Indices uniformly sampled from [0, time_mask_param).
        iid_masks (bool, optional): whether to apply different masks to each
            example/channel in the batch. (Default: ``False``)
            This option is applicable only when the input tensor >= 3D.
        p (float, optional): maximum proportion of time steps that can be masked.
            Must be within range [0.0, 1.0]. (Default: 1.0)

    Example
        >>> spectrogram = torchaudio.transforms.Spectrogram()
        >>> masking = torchaudio.transforms.TimeMasking(time_mask_param=80)
        >>>
        >>> original = spectrogram(waveform)
        >>> masked = masking(original)

        .. image::  https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking1.png
           :alt: The original spectrogram

        .. image::  https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking2.png
           :alt: The spectrogram masked along time axis
    time_mask_paramr  r  r#   Nc                 l    d|cxk  rdk  sn t        d| d      t        t        |   |d||       y )Nr   rg   z,The value of p must be between 0.0 and 1.0 (z given).r%   r"  )rW   r'   r,  r(   )r/   r-  r  r  r0   s       r1   r(   zTimeMasking.__init__  s;    a3KA3hWXXk4)/1i1)Mr2   )Frg   )	r9   r:   r;   r<   r?   rA   r@   r(   rD   rE   s   @r1   r,  r,    s5    >N N N NY] N Nr2   r,  c                   d     e Zd ZdZg dZ	 	 	 ddedededededed	ed
df fdZde	d
e	fdZ
 xZS )SpecAugmenta  Apply time and frequency masking to a spectrogram.
    Args:
        n_time_masks (int): Number of time masks. If its value is zero, no time masking will be applied.
        time_mask_param (int): Maximum possible length of the time mask.
        n_freq_masks (int): Number of frequency masks. If its value is zero, no frequency masking will be applied.
        freq_mask_param (int): Maximum possible length of the frequency mask.
        iid_masks (bool, optional): Applies iid masks to each of the examples in the batch dimension.
            This option is applicable only when the input tensor is 4D. (Default: ``True``)
        p (float, optional): maximum proportion of time steps that can be masked.
            Must be within range [0.0, 1.0]. (Default: 1.0)
        zero_masking (bool, optional): If ``True``, use 0 as the mask value,
            else use mean of the input tensor. (Default: ``False``)
    )n_time_masksr-  n_freq_masksr)  r  r  zero_maskingr1  r-  r2  r)  r  r  r3  r#   Nc                     t         t        |           || _        || _        || _        || _        || _        || _        || _	        y rM   )
r'   r0  r(   r1  r-  r2  r)  r  r  r3  )	r/   r1  r-  r2  r)  r  r  r3  r0   s	           r1   r(   zSpecAugment.__init__  sH     	k4)+(.(."(r2   rY   c                    | j                   rd}n|j                         }|j                         dz
  }|dz
  }|j                         dkD  r| j                  du rt	        | j
                        D ]0  }t        j                  || j                  ||| j                        }2 t	        | j                        D ]0  }t        j                  || j                  ||| j                        }2 |S t	        | j
                        D ]0  }t        j                  || j                  ||| j                        }2 t	        | j                        D ]0  }t        j                  || j                  ||| j                        }2 |S )z
        Args:
            specgram (Tensor): Tensor of shape `(..., freq, time)`.
        Returns:
            Tensor: Masked spectrogram of shape `(..., freq, time)`.
        r   rV   r%   Tr"  )r3  meanr$  r  ranger1  r5   r#  r-  r  r2  r)  r%  )r/   rY   r  time_dimfreq_dimr   s         r1   r8   zSpecAugment.forward*  sM    J!J<<>A%a<<<>A$..D"84,,-004;O;OQ[]eimioiop .4,,-004;O;OQ[]eimioiop .  4,,-,,Xt7K7KZYaeiekekl .4,,-,,Xt7K7KZYaeiekekl . r2   )Trg   Fr&  rE   s   @r1   r0  r0    s    M  ")) ) 	)
 ) ) ) ) 
)& 6 r2   r0  c                   :     e Zd ZdZdgZdef fdZdefdZ xZ	S )Loudnessa  Measure audio loudness according to the ITU-R BS.1770-4 recommendation.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Args:
        sample_rate (int): Sample rate of audio signal.

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.Loudness(sample_rate)
        >>> loudness = transform(waveform)

    Reference:
        - https://www.itu.int/rec/R-REC-BS.1770-4-201510-I/en
    rr   c                 8    t         t        |           || _        y rM   )r'   r;  r(   rr   )r/   rr   r0   s     r1   r(   zLoudness.__init__Z  s    h&(&r2   wavefromc                 B    t        j                  || j                        S )z
        Args:
            waveform(torch.Tensor): audio waveform of dimension `(..., channels, time)`

        Returns:
            Tensor: loudness estimates (LKFS)
        )r5   loudnessrr   )r/   r=  s     r1   r8   zLoudness.forward^  s     zz(D$4$455r2   r   rE   s   @r1   r;  r;  F  s)    " #OM'C '6 6r2   r;  c                   >     e Zd ZdZddedef fdZdedefdZ xZ	S )	Vola  Adjust volume of waveform.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        gain (float): Interpreted according to the given gain_type:
            If ``gain_type`` = ``amplitude``, ``gain`` is a positive amplitude ratio.
            If ``gain_type`` = ``power``, ``gain`` is a power (voltage squared).
            If ``gain_type`` = ``db``, ``gain`` is in decibels.
        gain_type (str, optional): Type of gain. One of: ``amplitude``, ``power``, ``db`` (Default: ``amplitude``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.Vol(gain=0.5, gain_type="amplitude")
        >>> quieter_waveform = transform(waveform)
    gain	gain_typec                 p    t         t        |           || _        || _        |dv r|dk  rt        d      y y )N)	amplituder   r   z9If gain_type = amplitude or power, gain must be positive.)r'   rA  r(   rB  rC  rW   )r/   rB  rC  r0   s      r1   r(   zVol.__init__}  sA    c4!#	"..4!8XYY 4<.r2   r3   r#   c                 T   | j                   dk(  r|| j                  z  }| j                   dk(  r t        j                  || j                        }| j                   dk(  r6t        j                  |dt        j                  | j                        z        }t        j                  |dd      S )r  rE  dbr   
   r}   rV   )rC  rB  r5   rh   ri   r)   clampr7   s     r1   r8   zVol.forward  s     >>[($))+H>>T!vvh		2H>>W$vvhTZZ		-B(BCH{{8R++r2   )rE  )
r9   r:   r;   r<   r@   rB   r(   r   r8   rD   rE   s   @r1   rA  rA  i  s1    &ZU Zs Z, ,6 ,r2   rA  c                   L     e Zd ZdZ	 ddededededdf
 fdZd	edefd
Z xZ	S )SlidingWindowCmna  
    Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600)
        min_cmn_window (int, optional):  Minimum CMN window used at start of decoding (adds latency only at start).
            Only applicable if center == false, ignored if center==true (int, default = 100)
        center (bool, optional): If true, use a window centered on the current frame
            (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)
        norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.SlidingWindowCmn(cmn_window=1000)
        >>> cmn_waveform = transform(waveform)
    
cmn_windowmin_cmn_windowr   	norm_varsr#   Nc                 Z    t         |           || _        || _        || _        || _        y rM   )r'   r(   rL  rM  r   rN  )r/   rL  rM  r   rN  r0   s        r1   r(   zSlidingWindowCmn.__init__  s-     	$,"r2   rY   c                     t        j                  || j                  | j                  | j                  | j
                        }|S )z
        Args:
            specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)`.

        Returns:
            Tensor: Tensor of spectrogram of dimension `(..., time, freq)`.
        )r5   sliding_window_cmnrL  rM  r   rN  )r/   rY   cmn_specgrams      r1   r8   zSlidingWindowCmn.forward  s:     ++HdootGZGZ\`\g\gimiwiwxr2   )iX  d   FF)
r9   r:   r;   r<   r?   rA   r(   r   r8   rD   rE   s   @r1   rK  rK    sO    , in##58#HL#ae#	#	 	6 	r2   rK  c            %            e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedededededed	ed
ededededee   dedededededdf$ fdZdedefdZ	 xZ
S )Vadu  Voice Activity Detector. Similar to SoX implementation.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Attempts to trim silence and quiet background sounds from the ends of recordings of speech.
    The algorithm currently uses a simple cepstral power measurement to detect voice,
    so may be fooled by other things, especially music.

    The effect can trim only from the front of the audio,
    so in order to trim from the back, the reverse effect must also be used.

    Args:
        sample_rate (int): Sample rate of audio signal.
        trigger_level (float, optional): The measurement level used to trigger activity detection.
            This may need to be changed depending on the noise level, signal level,
            and other characteristics of the input audio. (Default: 7.0)
        trigger_time (float, optional): The time constant (in seconds)
            used to help ignore short bursts of sound. (Default: 0.25)
        search_time (float, optional): The amount of audio (in seconds)
            to search for quieter/shorter bursts of audio to include prior
            to the detected trigger point. (Default: 1.0)
        allowed_gap (float, optional): The allowed gap (in seconds) between
            quiteter/shorter bursts of audio to include prior
            to the detected trigger point. (Default: 0.25)
        pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve
            before the trigger point and any found quieter/shorter bursts. (Default: 0.0)
        boot_time (float, optional) The algorithm (internally) uses adaptive noise
            estimation/reduction in order to detect the start of the wanted audio.
            This option sets the time for the initial noise estimate. (Default: 0.35)
        noise_up_time (float, optional) Time constant used by the adaptive noise estimator
            for when the noise level is increasing. (Default: 0.1)
        noise_down_time (float, optional) Time constant used by the adaptive noise estimator
            for when the noise level is decreasing. (Default: 0.01)
        noise_reduction_amount (float, optional) Amount of noise reduction to use in
            the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35)
        measure_freq (float, optional) Frequency of the algorithm’s
            processing/measurements. (Default: 20.0)
        measure_duration: (float or None, optional) Measurement duration.
            (Default: Twice the measurement period; i.e. with overlap.)
        measure_smooth_time (float, optional) Time constant used to smooth
            spectral measurements. (Default: 0.4)
        hp_filter_freq (float, optional) "Brick-wall" frequency of high-pass filter applied
            at the input to the detector algorithm. (Default: 50.0)
        lp_filter_freq (float, optional) "Brick-wall" frequency of low-pass filter applied
            at the input to the detector algorithm. (Default: 6000.0)
        hp_lifter_freq (float, optional) "Brick-wall" frequency of high-pass lifter used
            in the detector algorithm. (Default: 150.0)
        lp_lifter_freq (float, optional) "Brick-wall" frequency of low-pass lifter used
            in the detector algorithm. (Default: 2000.0)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> waveform_reversed, sample_rate = apply_effects_tensor(waveform, sample_rate, [["reverse"]])
        >>> transform = transforms.Vad(sample_rate=sample_rate, trigger_level=7.5)
        >>> waveform_reversed_front_trim = transform(waveform_reversed)
        >>> waveform_end_trim, sample_rate = apply_effects_tensor(
        >>>     waveform_reversed_front_trim, sample_rate, [["reverse"]]
        >>> )

    Reference:
        - http://sox.sourceforge.net/sox.html
    Nrr   trigger_leveltrigger_timesearch_timeallowed_gappre_trigger_time	boot_timenoise_up_timenoise_down_timenoise_reduction_amountmeasure_freqmeasure_durationmeasure_smooth_timehp_filter_freqlp_filter_freqhp_lifter_freqlp_lifter_freqr#   c                    t         |           || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        || _        || _        || _        || _        || _        || _        || _        y rM   )r'   r(   rr   rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  )r/   rr   rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  r0   s                     r1   r(   zVad.__init__  s    ( 	&*(&& 0"*.&<#( 0#6 ,,,,r2   r3   c                    t        j                  di d|d| j                  d| j                  d| j                  d| j
                  d| j                  d| j                  d| j                  d	| j                  d
| j                  d| j                  d| j                  d| j                  d| j                  d| j                  d| j                   d| j"                  d| j$                  S )aW  
        Args:
            waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)`
                Tensor of shape `(channels, time)` is treated as a multi-channel recording
                of the same event and the resulting output will be trimmed to the earliest
                voice activity in any channel.
        r3   rr   rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  r   )r5   vadrr   rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  r7   s     r1   r8   zVad.forward.  s)    uu 

((
 ,,
 **	

 ((
 ((
 "22
 nn
 ,,
 !00
 $(#>#>
 **
 "22
 !% 8 8
  ..
   ..!
"  ..#
$  ..%
 	
r2   )g      @      ?rg   ri  r   gffffff?r  g{Gz?g?rf   Ng?g      I@g     p@g     b@g     @@)r9   r:   r;   r<   r?   r@   r   r(   r   r8   rD   rE   s   @r1   rU  rU    s
   ?H  #" !"%"!%(,",0%( $ & % &%&-&- &- 	&-
 &- &-  &- &- &- &- !&&- &- #5/&- #&- &-  !&-" #&-$ %&-& 
'&-P
 
6 
r2   rU  c                        e Zd ZdZg dZddddej                  dfdededee   d	ee   d
ede	de
f   dee   ddf fdZde
de
fdZ xZS )SpectralCentroida  Compute the spectral centroid for each channel along the time axis.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    The spectral centroid is defined as the weighted average of the
    frequency values, weighted by their magnitude.

    Args:
        sample_rate (int): Sample rate of audio signal.
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        pad (int, optional): Two sided padding of signal. (Default: ``0``)
        window_fn (Callable[..., Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        wkwargs (dict or None, optional): Arguments for window function. (Default: ``None``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.SpectralCentroid(sample_rate)
        >>> spectral_centroid = transform(waveform)  # (channel, time)
    )rr   r   r   r   r   r   Nr   rr   r   r   r   r   r   .r   r#   c                    t         t        |           || _        || _        ||n|| _        ||n| j
                  dz  | _        | || j
                        n || j
                  fi |}| j                  d|       || _        y rI   )	r'   rk  r(   rr   r   r   r   r,   r   )
r/   rr   r   r   r   r   r   r   r&   r0   s
            r1   r(   zSpectralCentroid.__init__g  s     	.0&
(2(>*E(2(>*DOOWXDX/64??+IdooDiahDiXv.r2   r3   c           	          t        j                  || j                  | j                  | j                  | j
                  | j                  | j                        S )z
        Args:
            waveform (Tensor): Tensor of audio of dimension `(..., time)`.

        Returns:
            Tensor: Spectral Centroid of size `(..., time)`.
        )r5   spectral_centroidrr   r   r&   r   r   r   r7   s     r1   r8   zSpectralCentroid.forwardz  sC     ""d&&$++tzz4??\`\k\k
 	
r2   )r9   r:   r;   r<   r=   r)   r>   r?   r   r   r   rC   r(   r8   rD   rE   s   @r1   rk  rk  L  s    0 PM
 $($(+0+<+<"&  SM	
 SM  CK( $ 
&
 
6 
r2   rk  c                        e Zd ZU dZg dZeed<   eed<   ddddej                  dfded	ed
edede
e   de
e   dedef   de
e   ddf fdZd ZdedefdZ xZS )
PitchShifta  Shift the pitch of a waveform by ``n_steps`` steps.

    .. devices:: CPU CUDA

    .. properties:: TorchScript

    Args:
        waveform (Tensor): The input waveform of shape `(..., time)`.
        sample_rate (int): Sample rate of `waveform`.
        n_steps (int): The (fractional) steps to shift `waveform`.
        bins_per_octave (int, optional): The number of steps per octave (Default : ``12``).
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``).
        win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``).
        hop_length (int or None, optional): Length of hop between STFT windows. If None, then ``win_length // 4``
            is used (Default: ``None``).
        window (Tensor or None, optional): Window tensor that is applied/multiplied to each frame/window.
            If None, then ``torch.hann_window(win_length)`` is used (Default: ``None``).

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.PitchShift(sample_rate, 4)
        >>> waveform_shift = transform(waveform)  # (channel, time)
    )rr   n_stepsbins_per_octaver   r   r   r   r      i   Nrr   rq  rr  r   r   r   r   .r   r#   c	                    t         |           || _        || _        || _        || _        ||n|| _        ||n| j                  dz  | _        | || j                        n || j                  fi |}	| j                  d|	       dt        |       |z  z  }
t        ||
z        | _        t        j                  t        | j                        t        |            | _        | j                  |k7  rd| _        t        d d       | _        y y )Nr   r&   r   r}   )r  r   )r'   r(   rq  rr  rr   r   r   r   r,   r@   r?   r   rh   r   r   r
   r   )r/   rr   rq  rr  r   r   r   r   r   r&   r   r0   s              r1   r(   zPitchShift.__init__  s     	.&
(2(>*E(2(>*DOOWXDX/64??+IdooDiahDiXv.g89[4/088C/[1AB>>[(DJ0DIDK )r2   c                    | j                         r| j                  | j                  k7  rt        j                         5  t        | j                  | j                  | j                  |j                  |j                        \  }| _	        | j                  j                  |j                         | j                  j                  |       d d d        y y y # 1 sw Y   y xY w)N)r   r  )has_uninitialized_paramsr   rr   r)   no_gradr   r   r   r  r   r   materializer   copy_)r/   inputr   s      r1   initialize_parametersz PitchShift.initialize_parameters  s    ((*~~!1!11]]_)B((#kk$||*&FDJ KK++FLL9KK%%f- %_ 2 +$_s   BCCr3   c           	         |j                         }t        || j                  | j                  | j                  | j
                  | j                  | j                        }| j                  | j                  k7  rCt        || j                  | j                  | j                  | j                  | j                        }n|}t        ||      S )z
        Args:
            waveform (Tensor): Tensor of audio of dimension `(..., time)`.

        Returns:
            Tensor: The pitch-shifted audio of shape `(..., time)`.
        )r   r   rq  rr  r   r   r   r&   r   rr   r   r   r   r   r   )r/   r3   r   waveform_stretchwaveform_shifts        r1   r8   zPitchShift.forward  s     ,LL  JJOOOOKK
 >>T---8   

N .N"
 	
r2   )r9   r:   r;   r<   r=   r
   __annotations__r?   r)   r>   r   r   r   rC   r(   r{  r8   rD   rE   s   @r1   rp  rp    s    . gM""J  "$($(+0+<+<"&JJ J 	J
 J SMJ SMJ CK(J $J 
J8.#
 #
6 #
r2   rp  c            	       V     e Zd ZdZ	 	 	 	 ddedededef fdZde	de	d	e	d
e	fdZ
 xZS )RNNTLossa  Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
    :cite:`graves2012sequence`.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    The RNN Transducer loss extends the CTC loss by defining a distribution over output
    sequences of all lengths, and by jointly modelling both input-output and output-output
    dependencies.

    Args:
        blank (int, optional): blank label (Default: ``-1``)
        clamp (float, optional): clamp for gradients (Default: ``-1``)
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``"none"`` | ``"mean"`` | ``"sum"``. (Default: ``"mean"``)
        fused_log_softmax (bool): set to False if calling log_softmax outside of loss (Default: ``True``)

    Example
        >>> # Hypothetical values
        >>> logits = torch.tensor([[[[0.1, 0.6, 0.1, 0.1, 0.1],
        >>>                          [0.1, 0.1, 0.6, 0.1, 0.1],
        >>>                          [0.1, 0.1, 0.2, 0.8, 0.1]],
        >>>                         [[0.1, 0.6, 0.1, 0.1, 0.1],
        >>>                          [0.1, 0.1, 0.2, 0.1, 0.1],
        >>>                          [0.7, 0.1, 0.2, 0.1, 0.1]]]],
        >>>                       dtype=torch.float32,
        >>>                       requires_grad=True)
        >>> targets = torch.tensor([[1, 2]], dtype=torch.int)
        >>> logit_lengths = torch.tensor([2], dtype=torch.int)
        >>> target_lengths = torch.tensor([2], dtype=torch.int)
        >>> transform = transforms.RNNTLoss(blank=0)
        >>> loss = transform(logits, targets, logit_lengths, target_lengths)
        >>> loss.backward()
    blankrI  	reductionfused_log_softmaxc                 Z    t         |           || _        || _        || _        || _        y rM   )r'   r(   r  rI  r  r  )r/   r  rI  r  r  r0   s        r1   r(   zRNNTLoss.__init__  s-     	

"!2r2   logitstargetslogit_lengthstarget_lengthsc           
          t        j                  ||||| j                  | j                  | j                  | j
                        S )a  
        Args:
            logits (Tensor): Tensor of dimension `(batch, max seq length, max target length + 1, class)`
                containing output from joiner
            targets (Tensor): Tensor of dimension `(batch, max target length)` containing targets with zero padded
            logit_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of each sequence from encoder
            target_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of targets for each sequence
        Returns:
            Tensor: Loss with the reduction option applied. If ``reduction`` is  ``"none"``, then size (batch),
            otherwise scalar.
        )r5   	rnnt_lossr  rI  r  r  )r/   r  r  r  r  s        r1   r8   zRNNTLoss.forward'  s?    $ {{JJJJNN""	
 		
r2   )r}   g      r6  T)r9   r:   r;   r<   r?   r@   rB   rA   r(   r   r8   rD   rE   s   @r1   r  r    sm    "L "&33 3 	3
  3

 
 	

 
r2   r  c                   ~     e Zd ZdZd	deddf fdZdej                  dej                  dej                  fdZ xZ	S )
Convolvea  
    Convolves inputs along their last dimension using the direct method.
    Note that, in contrast to :class:`torch.nn.Conv1d`, which actually applies the valid cross-correlation
    operator, this module applies the true `convolution`_ operator.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        mode (str, optional): Must be one of ("full", "valid", "same").

            * "full": Returns the full convolution result, with shape `(..., N + M - 1)`, where
              `N` and `M` are the trailing dimensions of the two inputs. (Default)
            * "valid": Returns the segment of the full convolution result corresponding to where
              the two inputs overlap completely, with shape `(..., max(N, M) - min(N, M) + 1)`.
            * "same": Returns the center segment of the full convolution result, with shape `(..., N)`.

    .. _convolution:
        https://en.wikipedia.org/wiki/Convolution
    r   r#   Nc                 F    t        |       t        | 	          || _        y rM   r   r'   r(   r   r/   r   r0   s     r1   r(   zConvolve.__init__\      T"	r2   rk   yc                 F    t        j                  ||| j                        S a  
        Args:
            x (torch.Tensor): First convolution operand, with shape `(..., N)`.
            y (torch.Tensor): Second convolution operand, with shape `(..., M)`
                (leading dimensions must be broadcast-able with those of ``x``).

        Returns:
            torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., L)`, where
            the leading dimensions match those of ``x`` and `L` is dictated by ``mode``.
        )r   )r5   convolver   r/   rk   r  s      r1   r8   zConvolve.forwardb  s     zz!QTYY//r2   full
r9   r:   r;   r<   rB   r(   r)   r   r8   rD   rE   s   @r1   r  r  E  sA    ,S d 0 0%,, 05<< 0r2   r  c                   ~     e Zd ZdZd	deddf fdZdej                  dej                  dej                  fdZ xZ	S )
FFTConvolvea|  
    Convolves inputs along their last dimension using FFT. For inputs with large last dimensions, this module
    is generally much faster than :class:`Convolve`.
    Note that, in contrast to :class:`torch.nn.Conv1d`, which actually applies the valid cross-correlation
    operator, this module applies the true `convolution`_ operator.
    Also note that this module can only output float tensors (int tensor inputs will be cast to float).

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        mode (str, optional): Must be one of ("full", "valid", "same").

            * "full": Returns the full convolution result, with shape `(..., N + M - 1)`, where
              `N` and `M` are the trailing dimensions of the two inputs. (Default)
            * "valid": Returns the segment of the full convolution result corresponding to where
              the two inputs overlap completely, with shape `(..., max(N, M) - min(N, M) + 1)`.
            * "same": Returns the center segment of the full convolution result, with shape `(..., N)`.

    .. _convolution:
        https://en.wikipedia.org/wiki/Convolution
    r   r#   Nc                 F    t        |       t        | 	          || _        y rM   r  r  s     r1   r(   zFFTConvolve.__init__  r  r2   rk   r  c                 F    t        j                  ||| j                        S r  )r5   fftconvolver   r  s      r1   r8   zFFTConvolve.forward  s     }}Q		22r2   r  r  rE   s   @r1   r  r  p  sA    0S d 3 3%,, 35<< 3r2   r  r   speedr#   c                 t    t        || z        }t        |       }t        j                  ||      }||z  ||z  fS rM   )r?   rh   r   )r   r  source_sample_ratetarget_sample_rater   s        r1   _source_target_sample_rater    sD    UY./Y
((%'9
:C$&8C&???r2   c                        e Zd ZdZd fdZddeej                     deej                  eej                     f   fdZ	 xZ
S )Speedax  Adjusts waveform speed.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        orig_freq (int): Original frequency of the signals in ``waveform``.
        factor (float): Factor by which to adjust speed of input. Values greater than 1.0
            compress ``waveform`` in time, whereas values less than 1.0 stretch ``waveform`` in time.
    r#   c                     t         |           || _        || _        t	        ||      \  | _        | _        t        | j
                  | j                        | _        y )N)r   r   )	r'   r(   r   factorr  r  r  r   	resampler)r/   r   r  r0   s      r1   r(   zSpeed.__init__  sO    ";UV_ag;h8!8!D,C,CdNeNefr2   lengthsc                     |d}nHt        j                  || j                  z  | j                  z        j	                  |j
                        }| j                  |      |fS )  
        Args:
            waveform (torch.Tensor): Input signals, with shape `(..., time)`.
            lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform``, with shape `(...)`.
                If ``None``, all elements in ``waveform`` are treated as valid. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor or None):
                torch.Tensor
                    Speed-adjusted waveform, with shape `(..., new_time).`
                torch.Tensor or None
                    If ``lengths`` is not ``None``, valid lengths of signals in speed-adjusted waveform,
                    with shape `(...)`; otherwise, ``None``.
        N)r)   ceilr  r  tor   r  )r/   r3   r  out_lengthss       r1   r8   zSpeed.forward  sX      ?K**Wt/F/F%FI`I`%`addelerersK~~h'44r2   )r#   NrM   )r9   r:   r;   r<   r(   r   r)   r   r   r8   rD   rE   s   @r1   r  r    sH    
g5%,,)? 55QVQ]Q]_ghmhtht_uQuKv 5r2   r  c            
            e Zd ZdZdedee   ddf fdZ	 d
dej                  de
ej                     deej                  e
ej                     f   fd	Z xZS )SpeedPerturbationa  Applies the speed perturbation augmentation introduced in
    *Audio augmentation for speech recognition* :cite:`ko15_interspeech`. For a given input,
    the module samples a speed-up factor from ``factors`` uniformly at random and adjusts
    the speed of the input by that factor.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        orig_freq (int): Original frequency of the signals in ``waveform``.
        factors (Sequence[float]): Factors by which to adjust speed of input. Values greater than 1.0
            compress ``waveform`` in time, whereas values less than 1.0 stretch ``waveform`` in time.

    Example
        >>> speed_perturb = SpeedPerturbation(16000, [0.9, 1.1, 1.0, 1.0, 1.0])
        >>> # waveform speed will be adjusted by factor 0.9 with 20% probability,
        >>> # 1.1 with 20% probability, and 1.0 (i.e. kept the same) with 60% probability.
        >>> speed_perturbed_waveform = speed_perturb(waveform, lengths)
    r   factorsr#   Nc           	          t         |           t        j                  j	                  |D cg c]  }t        ||       c}      | _        y c c}w )N)r   r  )r'   r(   r)   nn
ModuleListr  speeders)r/   r   r  r  r0   s       r1   r(   zSpeedPerturbation.__init__  sA    ++el,mel[aUYv-Vel,mn,ms   Ar3   r  c                     t        t        j                  t        | j                        d            }t        | j                        D ]  \  }}||k(  s |||      c S  t        d      )r  r   z<Speeder not found; execution should have never reached here.)r?   r)   randintlenr  	enumerateRuntimeError)r/   r3   r  idxspeeder_idxspeeders         r1   r8   zSpeedPerturbation.forward  s_    $ %--DMM 2B78 %.dmm$< Kk!x11 %= YZZr2   rM   )r9   r:   r;   r<   r?   r   r@   r(   r)   r   r   r   r8   rD   rE   s   @r1   r  r    sv    *o# o oD o IM[[/7/E[	u||Xell33	4[r2   r  c                       e Zd ZdZ	 d	dej
                  dej
                  dej
                  deej
                     dej
                  f
dZy)
AddNoisezScales and adds noise to waveform per signal-to-noise ratio.
    See :meth:`torchaudio.functional.add_noise` for more details.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript
    Nr3   noisesnrr  r#   c                 2    t        j                  ||||      S )a  
        Args:
            waveform (torch.Tensor): Input waveform, with shape `(..., L)`.
            noise (torch.Tensor): Noise, with shape `(..., L)` (same shape as ``waveform``).
            snr (torch.Tensor): Signal-to-noise ratios in dB, with shape `(...,)`.
            lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform`` and ``noise``,
            with shape `(...,)` (leading dimensions must match those of ``waveform``). If ``None``, all
            elements in ``waveform`` and ``noise`` are treated as valid. (Default: ``None``)

        Returns:
            torch.Tensor: Result of scaling and adding ``noise`` to ``waveform``, with shape `(..., L)`
            (same shape as ``waveform``).
        )r5   	add_noise)r/   r3   r  r  r  s        r1   r8   zAddNoise.forward  s      {{8UC99r2   rM   )r9   r:   r;   r<   r)   r   r   r8   r   r2   r1   r  r    sW     qu::-2\\:@E:W_`e`l`lWm:	:r2   r  c                   f     e Zd ZdZddeddf fdZdej                  dej                  fdZ xZ	S )	PreemphasisaB  Pre-emphasizes a waveform along its last dimension.
    See :meth:`torchaudio.functional.preemphasis` for more details.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        coeff (float, optional): Pre-emphasis coefficient. Typically between 0.0 and 1.0.
            (Default: 0.97)
    coeffr#   Nc                 0    t         |           || _        y rM   r'   r(   r  r/   r  r0   s     r1   r(   zPreemphasis.__init__1      
r2   r3   c                 D    t        j                  || j                        S )z
        Args:
            waveform (torch.Tensor): Waveform, with shape `(..., N)`.

        Returns:
            torch.Tensor: Pre-emphasized waveform, with shape `(..., N)`.
        r  )r5   preemphasisr  r7   s     r1   r8   zPreemphasis.forward5  s     }}XTZZ88r2   g
ףp=
?
r9   r:   r;   r<   r@   r(   r)   r   r8   rD   rE   s   @r1   r  r  $  s6    
e t 9 9 9r2   r  c                   f     e Zd ZdZddeddf fdZdej                  dej                  fdZ xZ	S )	
Deemphasisa?  De-emphasizes a waveform along its last dimension.
    See :meth:`torchaudio.functional.deemphasis` for more details.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        coeff (float, optional): De-emphasis coefficient. Typically between 0.0 and 1.0.
            (Default: 0.97)
    r  r#   Nc                 0    t         |           || _        y rM   r  r  s     r1   r(   zDeemphasis.__init__M  r  r2   r3   c                 D    t        j                  || j                        S )z
        Args:
            waveform (torch.Tensor): Waveform, with shape `(..., N)`.

        Returns:
            torch.Tensor: De-emphasized waveform, with shape `(..., N)`.
        r  )r5   
deemphasisr  r7   s     r1   r8   zDeemphasis.forwardQ  s     ||HDJJ77r2   r  r  rE   s   @r1   r  r  @  s6    
e t 8 8 8r2   r  )>rh   r-   typingr   r   r   r   r   r)   r   torch.nn.modules.lazyr	   torch.nn.parameterr
   
torchaudior   r5    torchaudio.functional.functionalr   r   r   r   r   __all__r  Moduler   rG   rO   r^   rp   r   r   r   r   r   r   r   r   r   r   r  r(  r,  r0  r;  rA  rK  rU  rk  rp  r  r  r  r?   r@   r  r  r  r  r  r  r   r2   r1   <module>r     s.     = =   1 5 &  b
%((// b
JS
 S
lU
 U
p.aEHHOO .abBuxx BJUehhoo UptUXX__ tnT588?? Tnk588?? k\$@EHHOO $@N#CEHHOO #CLQwuxx QwhVEHHOO V>EL%((// ELPN4588?? N4b&v588?? &vRN| ND#N, #NLD%((// DN 6uxx  6F-,%((// -,`(uxx (VE
%((// E
P9
uxx 9
xj
%((// j
ZM
uxx M
`(0uxx (0V*3%((// *3Z@# @e @c3h @+5EHHOO +5\3[ 3[l:uxx :89%((// 988 8r2   