
from typing import Any, Optional

import torch
from torch import Tensor
from torch.nn import functional as F, init
from torch.nn.parameter import (
    Parameter,
    UninitializedBuffer,
    UninitializedParameter,
)

from ._functions import SyncBatchNorm as sync_batch_norm
from .lazy import LazyModuleMixin
from .module import Module


__all__ = [
    "BatchNorm1d",
    "LazyBatchNorm1d",
    "BatchNorm2d",
    "LazyBatchNorm2d",
    "BatchNorm3d",
    "LazyBatchNorm3d",
    "SyncBatchNorm",
]


class _NormBase(Module):
    """Common base of _InstanceNorm and _BatchNorm."""

    _version = 2
    __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
    num_features: int
    eps: float
    momentum: Optional[float]
    affine: bool
    track_running_stats: bool

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = Parameter(torch.empty(num_features, **factory_kwargs))
            self.bias = Parameter(torch.empty(num_features, **factory_kwargs))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        if self.track_running_stats:
            self.register_buffer(
                "running_mean", torch.zeros(num_features, **factory_kwargs)
            )
            self.register_buffer(
                "running_var", torch.ones(num_features, **factory_kwargs)
            )
            # num_batches_tracked is integral; ignore any float dtype
            # requested through factory_kwargs.
            self.register_buffer(
                "num_batches_tracked",
                torch.tensor(
                    0,
                    dtype=torch.long,
                    **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
                ),
            )
        else:
            self.register_buffer("running_mean", None)
            self.register_buffer("running_var", None)
            self.register_buffer("num_batches_tracked", None)
        self.reset_parameters()

    def reset_running_stats(self) -> None:
        if self.track_running_stats:
            # running stats buffers only exist when track_running_stats is True
            self.running_mean.zero_()
            self.running_var.fill_(1)
            self.num_batches_tracked.zero_()

    def reset_parameters(self) -> None:
        self.reset_running_stats()
        if self.affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def _check_input_dim(self, input):
        raise NotImplementedError

    def extra_repr(self):
        return (
            "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
            "track_running_stats={track_running_stats}".format(**self.__dict__)
        )

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        version = local_metadata.get("version", None)

        if (version is None or version < 2) and self.track_running_stats:
            # at version 2: added num_batches_tracked buffer
            #               it should default to 0
            num_batches_tracked_key = prefix + "num_batches_tracked"
            if num_batches_tracked_key not in state_dict:
                state_dict[num_batches_tracked_key] = (
                    self.num_batches_tracked
                    if self.num_batches_tracked is not None
                    and self.num_batches_tracked.device != torch.device("meta")
                    else torch.tensor(0, dtype=torch.long)
                )

        super()._load_from_state_dict(
            state_dict,
            prefix,
            local_metadata,
            strict,
            missing_keys,
            unexpected_keys,
            error_msgs,
        )


class _BatchNorm(_NormBase):
    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)

        # exponential_average_factor is set to self.momentum (when available)
        # only so that it gets updated in the ONNX graph when this node is
        # exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            if self.num_batches_tracked is not None:
                self.num_batches_tracked.add_(1)
                if self.momentum is None:  # use cumulative moving average
                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)
                else:  # use exponential moving average
                    exponential_average_factor = self.momentum

        # Mini-batch stats are used for normalization in training mode, and in
        # eval mode when the buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # Buffers are only updated when they are tracked and we are in training
        # mode; pass None otherwise so they won't be updated.
        return F.batch_norm(
            input,
            self.running_mean
            if not self.training or self.track_running_stats
            else None,
            self.running_var if not self.training or self.track_running_stats else None,
            self.weight,
            self.bias,
            bn_training,
            exponential_average_factor,
            self.eps,
        )


class _LazyNormBase(LazyModuleMixin, _NormBase):
    weight: UninitializedParameter  # type: ignore[assignment]
    bias: UninitializedParameter  # type: ignore[assignment]

    def __init__(
        self,
        eps=1e-5,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            # affine and track_running_stats are hardcoded to False to avoid
            # creating tensors that will soon be overwritten.
            0,
            eps,
            momentum,
            False,
            False,
            **factory_kwargs,
        )
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = UninitializedParameter(**factory_kwargs)
            self.bias = UninitializedParameter(**factory_kwargs)
        if self.track_running_stats:
            self.running_mean = UninitializedBuffer(**factory_kwargs)
            self.running_var = UninitializedBuffer(**factory_kwargs)
            self.num_batches_tracked = torch.tensor(
                0,
                dtype=torch.long,
                **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
            )

    def reset_parameters(self) -> None:
        if not self.has_uninitialized_params() and self.num_features != 0:
            super().reset_parameters()

    def initialize_parameters(self, input) -> None:  # type: ignore[override]
        if self.has_uninitialized_params():
            self.num_features = input.shape[1]
            if self.affine:
                assert isinstance(self.weight, UninitializedParameter)
                assert isinstance(self.bias, UninitializedParameter)
                self.weight.materialize((self.num_features,))
                self.bias.materialize((self.num_features,))
            if self.track_running_stats:
                self.running_mean.materialize((self.num_features,))
                self.running_var.materialize((self.num_features,))
            self.reset_parameters()


class BatchNorm1d(_BatchNorm):
    r"""Applies Batch Normalization over a 2D or 3D input.

Method described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the number of features or channels of the input). By default, the
elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
At train time in the forward pass, the variance is calculated via the biased estimator,
equivalent to ``torch.var(input, unbiased=False)``. However, the value stored in the
moving average of the variance is calculated via the unbiased estimator, equivalent to
``torch.var(input, unbiased=True)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.
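
    For example, with the default ``momentum=0.1``, a single update computes
    :math:`\hat{x}_\text{new} = 0.9 \times \hat{x} + 0.1 \times x_t`.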

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

Args:
    num_features: number of features or channels :math:`C` of the input
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics
        in both training and eval modes. Default: ``True``

Shape:
    - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
      :math:`C` is the number of features or channels, and :math:`L` is the sequence length
    - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm1d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm1d(100, affine=False)
    >>> input = torch.randn(20, 100)
    >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")


class LazyBatchNorm1d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
from ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics
        in both training and eval modes. Default: ``True``
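
A minimal usage sketch (shapes are illustrative; ``num_features`` is
inferred from ``input.size(1)`` on the first forward pass)::

    >>> # xdoctest: +SKIP
    >>> m = nn.LazyBatchNorm1d()
    >>> input = torch.randn(20, 100)
    >>> output = m(input)  # materializes parameters with num_features=100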
    """

    cls_to_become = BatchNorm1d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")


class BatchNorm2d(_BatchNorm):
    r"""Applies Batch Normalization over a 4D input.

4D is a mini-batch of 2D inputs
with additional channel dimension. Method described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
variance is calculated via the biased estimator, equivalent to
``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
variance is calculated via the unbiased estimator, equivalent to
``torch.var(input, unbiased=True)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, H, W)`
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``

Shape:
    - Input: :math:`(N, C, H, W)`
    - Output: :math:`(N, C, H, W)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm2d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm2d(100, affine=False)
    >>> input = torch.randn(20, 100, 35, 45)
    >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")


class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
from ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics
        in both training and eval modes. Default: ``True``
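
A minimal usage sketch (shapes are illustrative; ``num_features`` is
inferred from ``input.size(1)`` on the first forward pass)::

    >>> # xdoctest: +SKIP
    >>> m = nn.LazyBatchNorm2d()
    >>> input = torch.randn(20, 100, 35, 45)
    >>> output = m(input)  # materializes parameters with num_features=100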
    """

    cls_to_become = BatchNorm2d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")


class BatchNorm3d(_BatchNorm):
    r"""Applies Batch Normalization over a 5D input.

5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
variance is calculated via the biased estimator, equivalent to
``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
variance is calculated via the unbiased estimator, equivalent to
``torch.var(input, unbiased=True)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
or Spatio-temporal Batch Normalization.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, D, H, W)`
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics
        in both training and eval modes. Default: ``True``

Shape:
    - Input: :math:`(N, C, D, H, W)`
    - Output: :math:`(N, C, D, H, W)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm3d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm3d(100, affine=False)
    >>> input = torch.randn(20, 100, 35, 45, 10)
    >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")


class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
from ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics
        in both training and eval modes. Default: ``True``
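
A minimal usage sketch (shapes are illustrative; ``num_features`` is
inferred from ``input.size(1)`` on the first forward pass)::

    >>> # xdoctest: +SKIP
    >>> m = nn.LazyBatchNorm3d()
    >>> input = torch.randn(20, 100, 35, 45, 10)
    >>> output = m(input)  # materializes parameters with num_features=100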
    """

    cls_to_become = BatchNorm3d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")


class SyncBatchNorm(_BatchNorm):
    r"""Applies Batch Normalization over an N-dimensional input.

The N-D input is a mini-batch of [N-2]D inputs with an additional channel dimension, as described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over all
mini-batches of the same process group. :math:`\gamma` and :math:`\beta`
are learnable parameter vectors of size `C` (where `C` is the input size).
By default, the elements of :math:`\gamma` are set to 1 and the elements of
:math:`\beta` are set to 0. The variance is calculated via the biased estimator,
equivalent to `torch.var(input, unbiased=False)`.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
Normalization or Spatio-temporal Batch Normalization.

Currently :class:`SyncBatchNorm` only supports
:class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
:meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
:attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
Network with DDP.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, +)`
    eps: a value added to the denominator for numerical stability.
        Default: ``1e-5``
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics
        in both training and eval modes. Default: ``True``
    process_group: synchronization of stats happens within each process group
        individually. Default behavior is synchronization across the whole
        world

Shape:
    - Input: :math:`(N, C, +)`
    - Output: :math:`(N, C, +)` (same shape as input)

.. note::
    Synchronization of batchnorm statistics occurs only while training, i.e.
    synchronization is disabled when ``model.eval()`` is set or if
    ``self.training`` is otherwise ``False``.

Examples::

    >>> # xdoctest: +SKIP
    >>> # With Learnable Parameters
    >>> m = nn.SyncBatchNorm(100)
    >>> # creating process group (optional)
    >>> # ranks is a list of int identifying rank ids.
    >>> ranks = list(range(8))
    >>> r1, r2 = ranks[:4], ranks[4:]
    >>> # Note: every rank calls into new_group for every
    >>> # process group created, even if that rank is not
    >>> # part of the group.
    >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
    >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
    >>> # Without Learnable Parameters
    >>> m = nn.SyncBatchNorm(100, affine=False, process_group=process_group)
    >>> input = torch.randn(20, 100, 35, 45, 10)
    >>> output = m(input)

    >>> # network is nn.BatchNorm layer
    >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
    >>> # only single gpu per process is currently supported
    >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
    >>>                         sync_bn_network,
    >>>                         device_ids=[args.local_rank],
    >>>                         output_device=args.local_rank)
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        process_group: Optional[Any] = None,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )
        self.process_group = process_group

    def _check_input_dim(self, input):
        if input.dim() < 2:
            raise ValueError(f"expected at least 2D input (got {input.dim()}D input)")

    def _check_non_zero_input_channels(self, input):
        if input.size(1) == 0:
            raise ValueError(
                "SyncBatchNorm number of input channels should be non-zero"
            )

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)
        self._check_non_zero_input_channels(input)

        # exponential_average_factor is set to self.momentum (when available)
        # only so that it gets updated in the ONNX graph when this node is
        # exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            assert self.num_batches_tracked is not None
            self.num_batches_tracked.add_(1)
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / self.num_batches_tracked.item()
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

        # Mini-batch stats are used for normalization in training mode, and in
        # eval mode when the buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # If buffers are not to be tracked, ensure that they won't be updated.
        running_mean = (
            self.running_mean if not self.training or self.track_running_stats else None
        )
        running_var = (
            self.running_var if not self.training or self.track_running_stats else None
        )

        # Don't sync batchnorm stats in inference mode (model.eval()).
        need_sync = (
            bn_training
            and self.training
            and torch.distributed.is_available()
            and torch.distributed.is_initialized()
        )
        if need_sync:
            # currently only GPU/PrivateUse1 input is supported
            if input.device.type not in [
                "cuda",
                "xpu",
                torch._C._get_privateuse1_backend_name(),
            ]:
                raise ValueError(
                    "SyncBatchNorm expected input tensor to be on GPU or XPU or "
                    f"{torch._C._get_privateuse1_backend_name()}"
                )
            process_group = torch.distributed.group.WORLD
            if self.process_group:
                process_group = self.process_group
            world_size = torch.distributed.get_world_size(process_group)
            need_sync = world_size > 1

        # fallback to framework BN when synchronization is not necessary
        if not need_sync:
            return F.batch_norm(
                input,
                running_mean,
                running_var,
                self.weight,
                self.bias,
                bn_training,
                exponential_average_factor,
                self.eps,
            )
        else:
            assert bn_training
            return sync_batch_norm.apply(
                input,
                self.weight,
                self.bias,
                running_mean,
                running_var,
                self.eps,
                exponential_average_factor,
                process_group,
                world_size,
            )

    @classmethod
    def convert_sync_batchnorm(cls, module, process_group=None):
        r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

Args:
    module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
    process_group (optional): process group to scope synchronization,
        default is the whole world

Returns:
    The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
    layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
    a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
    instead.

Example::

    >>> # Network with nn.BatchNorm layer
    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> module = torch.nn.Sequential(
    >>>            torch.nn.Linear(20, 100),
    >>>            torch.nn.BatchNorm1d(100),
    >>>          ).cuda()
    >>> # creating process group (optional)
    >>> # ranks is a list of int identifying rank ids.
    >>> ranks = list(range(8))
    >>> r1, r2 = ranks[:4], ranks[4:]
    >>> # Note: every rank calls into new_group for every
    >>> # process group created, even if that rank is not
    >>> # part of the group.
    >>> # xdoctest: +SKIP("distributed")
    >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
    >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
    >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        """
        module_output = module
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module_output = torch.nn.SyncBatchNorm(
                module.num_features,
                module.eps,
                module.momentum,
                module.affine,
                module.track_running_stats,
                process_group,
            )
            if module.affine:
                with torch.no_grad():
                    module_output.weight = module.weight
                    module_output.bias = module.bias
            module_output.running_mean = module.running_mean
            module_output.running_var = module.running_var
            module_output.num_batches_tracked = module.num_batches_tracked
            module_output.training = module.training
            if hasattr(module, "qconfig"):
                module_output.qconfig = module.qconfig
        for name, child in module.named_children():
            module_output.add_module(
                name, cls.convert_sync_batchnorm(child, process_group)
            )
        del module
        return module_output