
    PhX                        d dl Z d dlmZmZ d dlZd dlmZ d dlmZ g Z		 	 	 	 ddej
                  dej
                  dej
                  de
d	ed
ededej
                  fdZ G d dej                  j                        Z G d dej                  j                        Z G d dej                  j                        Z G d dej                  j                        Zy)    N)OptionalUnion)Tensor)
functionalpsd_spsd_nreference_vectorsolutiondiagonal_loadingdiag_epsepsreturnc                     |dk(  rt        j                  | |||||      }|S |dk(  rt        j                  |       }nt        j                  | ||||      }t        j                  ||||||      }|S )a  Compute the MVDR beamforming weights with ``solution`` argument.

    Args:
        psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
            Tensor with dimensions `(..., freq, channel, channel)`.
        psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
            Tensor with dimensions `(..., freq, channel, channel)`.
        reference_vector (torch.Tensor): one-hot reference channel matrix.
        solution (str, optional): Solution to compute the MVDR beamforming weights.
            Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
        diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
            (Default: ``True``)
        diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
            It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
        eps (float, optional): Value to add to the denominator in the beamforming weight formula.
            (Default: ``1e-8``)

    Returns:
        torch.Tensor: the mvdr beamforming weight matrix
    ref_channelstv_evd)r   r   )Fmvdr_weights_soudenrtf_evd	rtf_powermvdr_weights_rtf)	r   r   r	   r
   r   r   r   beamform_vectorstvs	            oC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torchaudio/transforms/_multi_channel.py_get_mvdr_vectorr      s    : = //u>NP`bjlop  y ))E"C++eU,<O_jrsC,,S%9IK[]egjk    c                   r     e Zd ZdZd	dededef fdZd
dej                  de	ej                     fdZ
 xZS )PSDa  Compute cross-channel power spectral density (PSD) matrix.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``)
        normalize (bool, optional): If ``True``, normalize the mask along the time dimension. (Default: ``True``)
        eps (float, optional): Value to add to the denominator in mask normalization. (Default: ``1e-15``)
    
multi_mask	normalizer   c                 L    t         |           || _        || _        || _        y N)super__init__r   r   r   )selfr   r   r   	__class__s       r   r#   zPSD.__init__D   s#    $"r   specgrammaskc                     || j                   r|j                  d      }t        j                  ||| j                  | j
                        }|S )a  
        Args:
            specgram (torch.Tensor): Multi-channel complex-valued spectrum.
                Tensor with dimensions `(..., channel, freq, time)`.
            mask (torch.Tensor or None, optional): Time-Frequency mask for normalization.
                Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` or
                with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
                (Default: ``None``)

        Returns:
            torch.Tensor: The complex-valued PSD matrix of the input spectrum.
                Tensor with dimensions `(..., freq, channel, channel)`
        dim)r   meanr   psdr   r   )r$   r&   r'   r-   s       r   forwardzPSD.forwardJ   sB     yyRy(eeHdDNNDHH=
r   )FTgV瞯<r!   )__name__
__module____qualname____doc__boolfloatr#   torchr   r   r.   __classcell__r%   s   @r   r   r   7   sB    
4 D e  HU\\4J r   r   c                       e Zd ZdZ	 	 	 	 	 	 ddedededededef fdZ	 	 	 	 dd	e	j                  d
e	j                  de	j                  de	j                  de	j                  dedededede	j                  fdZd	e	j                  de	j                  de	j                  fdZd
e	j                  de	j                  de	j                  fdZ	 dde	j                  de	j                  dee	j                     de	j                  fdZ xZS )MVDRa  Minimum Variance Distortionless Response (MVDR) module that performs MVDR beamforming with Time-Frequency masks.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Based on https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/beamformer.py

    We provide three solutions of MVDR beamforming. One is based on *reference channel selection*
    :cite:`souden2009optimal` (``solution=ref_channel``).

    .. math::
        \textbf{w}_{\text{MVDR}}(f) =        \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)}        {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u}

    where :math:`\bf{\Phi}_{\textbf{SS}}` and :math:`\bf{\Phi}_{\textbf{NN}}` are the covariance        matrices of speech and noise, respectively. :math:`\bf{u}` is an one-hot vector to determine the         reference channel.

    The other two solutions are based on the steering vector (``solution=stv_evd`` or ``solution=stv_power``).

    .. math::
        \textbf{w}_{\text{MVDR}}(f) =        \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}}        {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}

    where :math:`\bm{v}` is the acoustic transfer function or the steering vector.        :math:`.^{\mathsf{H}}` denotes the Hermitian Conjugate operation.

    We apply either *eigenvalue decomposition*
    :cite:`higuchi2016robust` or the *power method* :cite:`mises1929praktische` to get the
    steering vector from the PSD matrix of speech.

    After estimating the beamforming weight, the enhanced Short-time Fourier Transform (STFT) is obtained by

    .. math::
        \hat{\bf{S}} = {\bf{w}^\mathsf{H}}{\bf{Y}}, {\bf{w}} \in \mathbb{C}^{M \times F}

    where :math:`\bf{Y}` and :math:`\hat{\bf{S}}` are the STFT of the multi-channel noisy speech and        the single-channel enhanced speech, respectively.

    For online streaming audio, we provide a *recursive method* :cite:`higuchi2017online` to update the
    PSD matrices of speech and noise, respectively.

    Args:
        ref_channel (int, optional): Reference channel for beamforming. (Default: ``0``)
        solution (str, optional): Solution to compute the MVDR beamforming weights.
            Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
        multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``)
        diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to the covariance matrix
            of the noise. (Default: ``True``)
        diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
            It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
        online (bool, optional): If ``True``, updates the MVDR beamforming weights based on
            the previous covarience matrices. (Default: ``False``)

    Note:
        To improve the numerical stability, the input spectrogram will be converted to double precision
        (``torch.complex128`` or ``torch.cdouble``) dtype for internal computation. The output spectrogram
        is converted to the dtype of the input spectrogram to be compatible with other modules.

    Note:
        If you use ``stv_evd`` solution, the gradient of the same input may not be identical if the
        eigenvalues of the PSD matrix are not distinct (i.e. some eigenvalues are close or identical).
    r   r
   r   diag_loadingr   onlinec                 
   t         |           |dvrt        dj                  |            || _        || _        || _        || _        || _        || _	        t        |      | _        t        j                  d      }t        j                  d      }t        j                  d      }	t        j                  d      }
| j                  d|       | j                  d|       | j                  d|	       | j                  d|
       y )N)r   r   	stv_powerzK`solution` must be one of ["ref_channel", "stv_evd", "stv_power"]. Given {}   r   r   
mask_sum_s
mask_sum_n)r"   r#   
ValueErrorformatr   r
   r   r:   r   r;   r   r-   r5   zerosregister_buffer)r$   r   r
   r   r:   r   r;   r   r   r?   r@   r%   s              r   r#   zMVDR.__init__   s     	 
 

 ]ddemn  ' $( z?#kk!n#kk!n#(;;q>
#(;;q>
We,We,\:6\:6r   r   r   mask_smask_nr	   r   r   r   c
           	      D   | j                   r$|j                  d      }|j                  d      }| j                  j                  dk(  rM|| _        || _        |j                  d      | _        |j                  d      | _        t        |||||||	      S | j                  ||      }| j                  ||      }|| _        || _        | j                  |j                  d      z   | _        | j                  |j                  d      z   | _        t        |||||||	      S )a  Recursively update the MVDR beamforming vector.

        Args:
            psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
                Tensor with dimensions `(..., freq, channel, channel)`.
            psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
                Tensor with dimensions `(..., freq, channel, channel)`.
            mask_s (torch.Tensor): Time-Frequency mask of the target speech.
                Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
                or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
            mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise.
                Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
                or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
            reference_vector (torch.Tensor): One-hot reference channel matrix.
            solution (str, optional): Solution to compute the MVDR beamforming weights.
                Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
            diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
                (Default: ``True``)
            diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
                It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
            eps (float, optional): Value to add to the denominator in the beamforming weight formula.
                (Default: ``1e-8``)

        Returns:
            torch.Tensor: The MVDR beamforming weight matrix.
        r)   r*   r>   )r   r,   r   ndimr   sumr?   r@   r   _get_updated_psd_speech_get_updated_psd_noise)
r$   r   r   rE   rF   r	   r
   r   r   r   s
             r   _get_updated_mvdr_vectorzMVDR._get_updated_mvdr_vector   s   L ??[[R[(F[[R[(F::??aDJDJ$jjRj0DO$jjRj0DO#E52BHN^`hjmnn00?E//v>EDJDJ"oo

r
0BBDO"oo

r
0BBDO#E52BHN^`hjmnnr   c                     | j                   | j                   |j                  d      z   z  }d| j                   |j                  d      z   z  }| j                  |d   z  ||d   z  z   }|S )a  Update psd of speech recursively.

        Args:
            psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
                Tensor with dimensions `(..., freq, channel, channel)`.
            mask_s (torch.Tensor): Time-Frequency mask of the target speech.
                Tensor with dimensions `(..., freq, time)`.

        Returns:
            torch.Tensor: The updated PSD matrix of target speech.
        rH   r*   r>   .NN)r?   rJ   r   )r$   r   rE   	numeratordenominators        r   rK   zMVDR._get_updated_psd_speech  k     OOt9K'KL	4??VZZBZ-??@

Y77%+oB^:^^r   c                     | j                   | j                   |j                  d      z   z  }d| j                   |j                  d      z   z  }| j                  |d   z  ||d   z  z   }|S )a  Update psd of noise recursively.

        Args:
            psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
                Tensor with dimensions `(..., freq, channel, channel)`.
            mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise.
                Tensor with dimensions `(..., freq, time)`.

        Returns:
            torch.Tensor:  The updated PSD matrix of noise.
        rH   r*   r>   rO   )r@   rJ   r   )r$   r   rF   rP   rQ   s        r   rL   zMVDR._get_updated_psd_noise  rR   r   r&   c           
         |j                   }|j                  dk  rt        d|j                         |j	                         st        d|j                          |j                   t
        j                  k(  r|j                         }|t        j                  d       d|z
  }| j                  ||      }| j                  ||      }t        j                  |j                         dd |j                  t
        j                        }|d	| j                  f   j                  d       | j                   r7| j#                  |||||| j$                  | j&                  | j(                        }n.t+        |||| j$                  | j&                  | j(                        }t-        j.                  ||      }	|	j1                  |      S )
a`  Perform MVDR beamforming.

        Args:
            specgram (torch.Tensor): Multi-channel complex-valued spectrum.
                Tensor with dimensions `(..., channel, freq, time)`
            mask_s (torch.Tensor): Time-Frequency mask of target speech.
                Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
                or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
            mask_n (torch.Tensor or None, optional): Time-Frequency mask of noise.
                Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
                or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
                (Default: None)

        Returns:
            torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
           z?Expected at least 3D tensor (..., channel, freq, time). Found: ziThe type of ``specgram`` tensor must be ``torch.cfloat`` or ``torch.cdouble``.                    Found: Nz=``mask_n`` is not provided, use ``1 - mask_s`` as ``mask_n``.r>   )devicedtype.)rX   rI   rA   shape
is_complexr5   cfloatcdoublewarningswarnr-   rC   sizerW   r   fill_r;   rM   r
   r:   r   r   r   apply_beamformingto)
r$   r&   rE   rF   rX   r   r   uw_mvdrspecgram_enhanceds
             r   r.   zMVDR.forward#  s   & ==1^_g_m_m^nopp""$$NN+-  >>U\\)'')H>MMYZZF6*6*KK,X__EMMZ	#t
 &&q);;22uffa@Q@QSWS`S`F &eUAt}}dFWFWY]YfYfgF//A ##E**r   )r   r   FTHz>Fr   Trf   :0yE>r!   )r/   r0   r1   r2   intstrr3   r4   r#   r5   r   rM   rK   rL   r   r.   r6   r7   s   @r   r9   r9   a   s   AJ % !!7!7 !7 	!7
 !7 !7 !7T &!%7o||7o ||7o 	7o
 7o  ,,7o 7o 7o 7o 7o 
7orU\\ 5<< TYT`T` "ELL %,, SXS_S_ $ ^b1+1+.3ll1+DLU\\DZ1+	1+r   r9   c                   J    e Zd ZdZ	 	 	 ddedededeeef   dededed	efd
Z	y)RTFMVDRa  Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) module
    based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the relative transfer function (RTF) matrix
    or the steering vector of target speech :math:`\bm{v}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and
    a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel
    complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as:

    .. math::
        \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f)

    where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin,
    :math:`(.)^{\mathsf{H}}` denotes the Hermitian Conjugate operation.

    The beamforming weight is computed by:

    .. math::
        \textbf{w}_{\text{MVDR}}(f) =
        \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}}
        {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}
    r&   rtfr   reference_channelr   r   r   r   c                 f    t        j                  ||||||      }t        j                  ||      }	|	S )a  
        Args:
            specgram (torch.Tensor): Multi-channel complex-valued spectrum.
                Tensor with dimensions `(..., channel, freq, time)`
            rtf (torch.Tensor): The complex-valued RTF vector of target speech.
                Tensor with dimensions `(..., freq, channel)`.
            psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
                Tensor with dimensions `(..., freq, channel, channel)`.
            reference_channel (int or torch.Tensor): Specifies the reference channel.
                If the dtype is ``int``, it represents the reference channel index.
                If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension
                is one-hot.
            diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
                (Default: ``True``)
            diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
                It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
            eps (float, optional): Value to add to the denominator in the beamforming weight formula.
                (Default: ``1e-8``)

        Returns:
            torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
        )r   r   ra   )
r$   r&   rm   r   rn   r   r   r   rd   spectrum_enhanceds
             r   r.   zRTFMVDR.forwardr  s<    @ ##C0ACSU]_bc//A  r   NTrf   rh   )
r/   r0   r1   r2   r   r   ri   r3   r4   r.    r   r   rl   rl   W  so    @ "&"!"! "! 	"!
 !f-"! "! "! "! 
"!r   rl   c                   ^    e Zd ZdZ	 	 	 ddedededeeef   dededed	e	j                  fd
Z
y)
SoudenMVDRa  Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) module
    based on the method proposed by *Souden et, al.* :cite:`souden2009optimal`.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the power spectral density (PSD) matrix
    of target speech :math:`\bf{\Phi}_{\textbf{SS}}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and
    a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel
    complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as:

    .. math::
        \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f)

    where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin.

    The beamforming weight is computed by:

    .. math::
        \textbf{w}_{\text{MVDR}}(f) =
        \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)}
        {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u}
    r&   r   r   rn   r   r   r   r   c                 f    t        j                  ||||||      }t        j                  ||      }	|	S )a  
        Args:
            specgram (torch.Tensor): Multi-channel complex-valued spectrum.
                Tensor with dimensions `(..., channel, freq, time)`.
            psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
                Tensor with dimensions `(..., freq, channel, channel)`.
            psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
                Tensor with dimensions `(..., freq, channel, channel)`.
            reference_channel (int or torch.Tensor): Specifies the reference channel.
                If the dtype is ``int``, it represents the reference channel index.
                If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension
                is one-hot.
            diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
                (Default: ``True``)
            diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
                It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
            eps (float, optional): Value to add to the denominator in the beamforming weight formula.
                (Default: ``1e-8``)

        Returns:
            torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
        )r   r   ra   )
r$   r&   r   r   rn   r   r   r   rd   rp   s
             r   r.   zSoudenMVDR.forward  s<    @ &&ue5FHXZbdgh//A  r   Nrq   )r/   r0   r1   r2   r   r   ri   r3   r4   r5   r.   rr   r   r   rt   rt     sr    > "&"!"! "! 	"!
 !f-"! "! "! "! 
"!r   rt   rg   )r]   typingr   r   r5   r   
torchaudior   r   __all__rj   r3   r4   r   nnModuler   r9   rl   rt   rr   r   r   <module>r{      s     "   &  "!&<<&<<& ll& 	&
 & & 
& \\&R'%((// 'Ts+588?? s+l=!ehhoo =!@<! <!r   