from typing import Optional, Any

import torch
from torch import Tensor
from torch.nn.parameter import Parameter, UninitializedParameter, UninitializedBuffer

from .. import functional as F
from .. import init
from ._functions import SyncBatchNorm as sync_batch_norm
from .lazy import LazyModuleMixin
from .module import Module

__all__ = [
    "BatchNorm1d",
    "LazyBatchNorm1d",
    "BatchNorm2d",
    "LazyBatchNorm2d",
    "BatchNorm3d",
    "LazyBatchNorm3d",
    "SyncBatchNorm",
]


class _NormBase(Module):
    """Common base of _InstanceNorm and _BatchNorm"""

    _version = 2
    __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
    num_features: int
    eps: float
    momentum: float
    affine: bool
    track_running_stats: bool

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: float = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = Parameter(torch.empty(num_features, **factory_kwargs))
            self.bias = Parameter(torch.empty(num_features, **factory_kwargs))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        if self.track_running_stats:
            self.register_buffer("running_mean", torch.zeros(num_features, **factory_kwargs))
            self.register_buffer("running_var", torch.ones(num_features, **factory_kwargs))
            self.running_mean: Optional[Tensor]
            self.running_var: Optional[Tensor]
            # num_batches_tracked is always integral, so the dtype entry is
            # dropped from factory_kwargs and only the device is forwarded.
            self.register_buffer(
                "num_batches_tracked",
                torch.tensor(
                    0,
                    dtype=torch.long,
                    **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
                ),
            )
            self.num_batches_tracked: Optional[Tensor]
        else:
            self.register_buffer("running_mean", None)
            self.register_buffer("running_var", None)
            self.register_buffer("num_batches_tracked", None)
        self.reset_parameters()

    def reset_running_stats(self) -> None:
        if self.track_running_stats:
            # The running-stat buffers are only registered when
            # track_running_stats is on.
            self.running_mean.zero_()
            self.running_var.fill_(1)
            self.num_batches_tracked.zero_()

    def reset_parameters(self) -> None:
        self.reset_running_stats()
        if self.affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def _check_input_dim(self, input):
        raise NotImplementedError

    def extra_repr(self):
        return (
            "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
            "track_running_stats={track_running_stats}".format(**self.__dict__)
        )

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        version = local_metadata.get("version", None)

        if (version is None or version < 2) and self.track_running_stats:
            # At version 2 the num_batches_tracked buffer was added; give it a
            # default value of 0 when loading checkpoints saved before that.
            num_batches_tracked_key = prefix + "num_batches_tracked"
            if num_batches_tracked_key not in state_dict:
                state_dict[num_batches_tracked_key] = (
                    self.num_batches_tracked
                    if self.num_batches_tracked is not None
                    else torch.tensor(0, dtype=torch.long)
                )

        super()._load_from_state_dict(
            state_dict,
            prefix,
            local_metadata,
            strict,
            missing_keys,
            unexpected_keys,
            error_msgs,
        )


class _BatchNorm(_NormBase):
    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: float = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)

        # exponential_average_factor is set to self.momentum (when available)
        # so that it gets updated in the ONNX graph when this node is exported.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            if self.num_batches_tracked is not None:
                self.num_batches_tracked.add_(1)
                if self.momentum is None:  # use cumulative moving average
                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)
                else:  # use exponential moving average
                    exponential_average_factor = self.momentum

        # Mini-batch stats are used for normalization in training mode, and in
        # eval mode when the running-stat buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # Buffers are only updated when they are tracked and we are in training
        # mode, so they are passed only when the update should occur (training)
        # or when they are used for normalization (eval with buffers present).
        return F.batch_norm(
            input,
            # If buffers are not to be tracked, ensure that they won't be updated.
            self.running_mean if not self.training or self.track_running_stats else None,
            self.running_var if not self.training or self.track_running_stats else None,
            self.weight,
            self.bias,
            bn_training,
            exponential_average_factor,
            self.eps,
        )


class _LazyNormBase(LazyModuleMixin, _NormBase):

    weight: UninitializedParameter
    bias: UninitializedParameter

    def __init__(self, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        # affine and track_running_stats are hardcoded to False in the super
        # call to avoid creating tensors that would immediately be overwritten.
        super().__init__(0, eps, momentum, False, False, **factory_kwargs)
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = UninitializedParameter(**factory_kwargs)
            self.bias = UninitializedParameter(**factory_kwargs)
        if self.track_running_stats:
            self.running_mean = UninitializedBuffer(**factory_kwargs)
            self.running_var = UninitializedBuffer(**factory_kwargs)
            self.num_batches_tracked = torch.tensor(
                0,
                dtype=torch.long,
                **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
            )

    def reset_parameters(self) -> None:
        if not self.has_uninitialized_params() and self.num_features != 0:
            super().reset_parameters()

    def initialize_parameters(self, input) -> None:
        if self.has_uninitialized_params():
            self.num_features = input.shape[1]
            if self.affine:
                assert isinstance(self.weight, UninitializedParameter)
                assert isinstance(self.bias, UninitializedParameter)
                self.weight.materialize((self.num_features,))
                self.bias.materialize((self.num_features,))
            if self.track_running_stats:
                self.running_mean.materialize((self.num_features,))
                self.running_var.materialize((self.num_features,))
            self.reset_parameters()


class BatchNorm1d(_BatchNorm):
    r"""Applies Batch Normalization over a 2D or 3D input as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input). By default, the
    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
    At train time in the forward pass, the standard-deviation is calculated via the biased estimator,
    equivalent to ``torch.var(input, unbiased=False)``. However, the value stored in the
    moving average of the standard-deviation is calculated via the unbiased estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.
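
        With the default ``momentum=0.1``, for example, a running mean of
        ``1.0`` and a batch mean of ``2.0`` update to
        :math:`(1 - 0.1) \times 1.0 + 0.1 \times 2.0 = 1.1`.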

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
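        >>> # Illustrative sketch (not part of the original docstring): in
        >>> # training mode each forward pass also updates the running
        >>> # statistics in place
        >>> _ = m(input)
        >>> int(m.num_batches_tracked)
        2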
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")


class LazyBatchNorm1d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization of
    the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
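
    Example (an illustrative sketch, not part of the original docstring)::

        >>> m = nn.LazyBatchNorm1d()
        >>> _ = m(torch.randn(20, 100))
        >>> m.weight.shape
        torch.Size([100])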
    """

    cls_to_become = BatchNorm1d

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")


class BatchNorm2d(_BatchNorm):
    r"""Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs
    with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")


class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization of
    the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm2d

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")


class BatchNorm3d(_BatchNorm):
    r"""Applies Batch Normalization over a 5D input (a mini-batch of 3D inputs
    with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")


class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization of
    the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm3d

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")


class SyncBatchNorm(_BatchNorm):
    r"""Applies Batch Normalization over an N-dimensional input (a mini-batch of [N-2]D inputs
    with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
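
        >>> # Illustrative sketch (an assumption, not from the original
        >>> # docstring): with no initialized process group, the layer falls
        >>> # back to ordinary (unsynchronized) batch normalization
        >>> m = nn.SyncBatchNorm(100)
        >>> output = m(torch.randn(20, 100, 35, 45))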
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: float = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        process_group: Optional[Any] = None,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
        self.process_group = process_group

    def _check_input_dim(self, input):
        if input.dim() < 2:
            raise ValueError(f"expected at least 2D input (got {input.dim()}D input)")

    def _check_non_zero_input_channels(self, input):
        if input.size(1) == 0:
            raise ValueError("SyncBatchNorm number of input channels should be non-zero")

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)
        self._check_non_zero_input_channels(input)

        # exponential_average_factor is set to self.momentum (when available)
        # so that it gets updated in the ONNX graph when this node is exported.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            assert self.num_batches_tracked is not None
            self.num_batches_tracked.add_(1)
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / self.num_batches_tracked.item()
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

        # Mini-batch stats are used for normalization in training mode, and in
        # eval mode when the running-stat buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # Buffers are only updated when they are tracked and we are in training
        # mode, so they are passed only when the update should occur (training)
        # or when they are used for normalization (eval with buffers present).
        running_mean = (
            self.running_mean if not self.training or self.track_running_stats else None
        )
        running_var = (
            self.running_var if not self.training or self.track_running_stats else None
        )

        # Don't sync batchnorm stats in inference mode (model.eval()).
        need_sync = (
            bn_training
            and self.training
            and torch.distributed.is_available()
            and torch.distributed.is_initialized()
        )
        if need_sync:
            if input.device.type not in ["cuda", torch._C._get_privateuse1_backend_name()]:
                raise ValueError(
                    "SyncBatchNorm expected input tensor to be on GPU or "
                    f"{torch._C._get_privateuse1_backend_name()}"
                )
            process_group = torch.distributed.group.WORLD
            if self.process_group:
                process_group = self.process_group
            world_size = torch.distributed.get_world_size(process_group)
            need_sync = world_size > 1

        # Fall back to framework batch norm when synchronization is unnecessary.
        if not need_sync:
            return F.batch_norm(
                input,
                running_mean,
                running_var,
                self.weight,
                self.bias,
                bn_training,
                exponential_average_factor,
                self.eps,
            )
        else:
            assert bn_training
            return sync_batch_norm.apply(
                input,
                self.weight,
                self.bias,
                running_mean,
                running_var,
                self.eps,
                exponential_average_factor,
                process_group,
                world_size,
            )

    @classmethod
    def convert_sync_batchnorm(cls, module, process_group=None):
        r"""Helper function to convert all :attr:`BatchNorm*D` layers in the model to
        :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        """
        module_output = module
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module_output = torch.nn.SyncBatchNorm(
                module.num_features,
                module.eps,
                module.momentum,
                module.affine,
                module.track_running_stats,
                process_group,
            )
            if module.affine:
                with torch.no_grad():
                    module_output.weight = module.weight
                    module_output.bias = module.bias
            module_output.running_mean = module.running_mean
            module_output.running_var = module.running_var
            module_output.num_batches_tracked = module.num_batches_tracked
            module_output.training = module.training
            if hasattr(module, "qconfig"):
                module_output.qconfig = module.qconfig
        for name, child in module.named_children():
            module_output.add_module(
                name, cls.convert_sync_batchnorm(child, process_group)
            )
        del module
        return module_output