
import warnings

import torch
from torch._utils import (
    _flatten_dense_tensors,
    _get_device_index,
    _handle_complex,
    _reorder_tensors_as,
    _take_tensors,
    _unflatten_dense_tensors,
)
from torch.cuda import nccl


def broadcast(tensor, devices=None, *, out=None):
    r"""Broadcasts a tensor to specified GPU devices.

    Args:
        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to broadcast.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing copies of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a copy of
            :attr:`tensor`.
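
    Example (illustrative; assumes at least two CUDA devices are visible)::

        >>> # hypothetical two-GPU session
        >>> x = torch.randn(4, device="cuda:0")
        >>> y0, y1 = broadcast(x, devices=[0, 1])
        >>> y1.device
        device(type='cuda', index=1)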
    """
    tensor = _handle_complex(tensor)
    if not ((devices is None) ^ (out is None)):
        raise RuntimeError(
            f"Exactly one of 'devices' and 'out' must be specified, "
            f"but got devices={devices} and out={out}"
        )
    if devices is not None:
        devices = [_get_device_index(d) for d in devices]
        return torch._C._broadcast(tensor, devices)
    else:
        return torch._C._broadcast_out(tensor, out)


def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    r"""Broadcast a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number of
    synchronizations.

    Args:
        tensors (sequence): tensors to broadcast. Must be on the same device,
          either CPU or GPU.
        devices (Iterable[torch.device, str or int]): an iterable of GPU
          devices, among which to broadcast.
        buffer_size (int): maximum size of the buffer used for coalescing.

    Returns:
        A tuple containing copies of each tensor in :attr:`tensors`, placed on
        :attr:`devices`.
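
    Example (illustrative; assumes two CUDA devices)::

        >>> # hypothetical two-GPU session
        >>> params = [torch.randn(4, 4, device="cuda:0") for _ in range(3)]
        >>> copies = broadcast_coalesced(params, devices=[0, 1])
        >>> len(copies), copies[1][0].device
        (2, device(type='cuda', index=1))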
    """
    devices = [_get_device_index(d) for d in devices]
    tensors = [_handle_complex(t) for t in tensors]
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)


def reduce_add(inputs, destination=None):
    r"""Sum tensors from multiple GPUs.

    All inputs should have matching shapes, dtype, and layout. The output
    tensor will be of the same shape, dtype, and layout.

    Args:
        inputs (Iterable[Tensor]): an iterable of tensors to add.
        destination (int, optional): a device on which the output will be
            placed (default: current device).

    Returns:
        A tensor containing an elementwise sum of all inputs, placed on the
        :attr:`destination` device.
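
    Example (illustrative; assumes two CUDA devices)::

        >>> # hypothetical two-GPU session
        >>> xs = [torch.ones(3, device=f"cuda:{i}") for i in range(2)]
        >>> reduce_add(xs, destination=0)
        tensor([2., 2., 2.], device='cuda:0')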
    """
    destination = _get_device_index(destination, optional=True)
    input_size = inputs[0].size()
    root_index = None  # index of an input that already lives on the destination
    for i, inp in enumerate(inputs):
        assert inp.device.type != "cpu", "reduce_add expects all inputs to be on GPUs"
        if inp.get_device() == destination:
            root_index = i
        if inp.size() != input_size:
            got = "x".join(str(x) for x in inp.size())
            expected = "x".join(str(x) for x in input_size)
            raise ValueError(
                f"input {i} has invalid size: got {got}, but expected {expected}"
            )
    if root_index is None:
        raise RuntimeError(
            "reduce_add expects destination to be on the same GPU with one of the tensors"
        )

    if len(inputs) == 1:
        return inputs[0]

    if nccl.is_available(inputs):
        result = torch.empty_like(inputs[root_index])
        nccl.reduce(inputs, output=result, root=root_index)
    else:
        destination_device = torch.device(inputs[root_index].device.type, destination)
        nonroot = [t for i, t in enumerate(inputs) if i != root_index]
        # make a new tensor without cloning: add the first non-root input to the root
        result = inputs[root_index] + nonroot[0].to(
            device=destination_device, non_blocking=True
        )
        for other in nonroot[1:]:
            result.add_(other.to(device=destination_device, non_blocking=True))
    return result


def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    r"""Sum tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing.

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
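
    Example (illustrative; assumes two CUDA devices holding matching lists of
    tensors)::

        >>> # hypothetical two-GPU session
        >>> gpu0 = [torch.ones(2, device="cuda:0"), torch.ones(3, device="cuda:0")]
        >>> gpu1 = [torch.ones(2, device="cuda:1"), torch.ones(3, device="cuda:1")]
        >>> sums = reduce_add_coalesced([gpu0, gpu1], destination=0)
        >>> [s.sum().item() for s in sums]
        [4.0, 6.0]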
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse tensors first, since they may have different sizes on
    # different GPUs
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)  # this will be sparse too
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes: flatten each coalesced
    # chunk, reduce once per chunk, then unflatten the summed result
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            # the unflattened tensors do not share storage with the flat base
            # tensor, which is never exposed, so return the plain data
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))


def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None):
    r"""Scatters tensor across multiple GPUs.

    Args:
        tensor (Tensor): tensor to scatter. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to scatter.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
          each device. It should match :attr:`devices` in length and sum to
          ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided
          into equal chunks.
        dim (int, optional): a dimension along which to chunk :attr:`tensor`.
          Default: ``0``.
        streams (Iterable[torch.cuda.Stream], optional): an iterable of Streams, among
          which to execute the scatter. If not specified, the default stream will
          be utilized.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results. Sizes of these tensors must match that of
          :attr:`tensor`, except for :attr:`dim`, where the total size must
          sum to ``tensor.size(dim)``.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified. When
        :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and
        will be inferred from sizes of :attr:`out`.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing chunks of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a chunk of
            :attr:`tensor`.
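
    Example (illustrative; assumes two CUDA devices)::

        >>> # hypothetical two-GPU session; the CPU source is split along dim 0
        >>> x = torch.arange(6).reshape(6, 1)
        >>> chunks = scatter(x, devices=[0, 1])
        >>> [(c.shape, c.device.index) for c in chunks]
        [(torch.Size([3, 1]), 0), (torch.Size([3, 1]), 1)]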
    """
    tensor = _handle_complex(tensor)
    if out is None:
        devices = [_get_device_index(d) for d in devices]
        return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
    else:
        if devices is not None:
            raise RuntimeError(
                f"'devices' must not be specified when 'out' is specified, "
                f"but got devices={devices}"
            )
        if chunk_sizes is not None:
            raise RuntimeError(
                f"'chunk_sizes' must not be specified when 'out' is specified, "
                f"but got chunk_sizes={chunk_sizes}"
            )
        return tuple(torch._C._scatter_out(tensor, out, dim, streams))


def gather(tensors, dim=0, destination=None, *, out=None):
    r"""Gathers tensors from multiple GPU devices.

    Args:
        tensors (Iterable[Tensor]): an iterable of tensors to gather.
          Tensor sizes in all dimensions other than :attr:`dim` have to match.
        dim (int, optional): a dimension along which the tensors will be
          concatenated. Default: ``0``.
        destination (torch.device, str, or int, optional): the output device.
          Can be CPU or CUDA. Default: the current CUDA device.
        out (Tensor, optional, keyword-only): the tensor to store gather result.
          Its sizes must match those of :attr:`tensors`, except for :attr:`dim`,
          where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``.
          Can be on CPU or CUDA.

    .. note::
        :attr:`destination` must not be specified when :attr:`out` is specified.

    Returns:
        - If :attr:`destination` is specified,
            a tensor located on :attr:`destination` device, that is a result of
            concatenating :attr:`tensors` along :attr:`dim`.
        - If :attr:`out` is specified,
            the :attr:`out` tensor, now containing results of concatenating
            :attr:`tensors` along :attr:`dim`.
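
    Example (illustrative; assumes two CUDA devices, gathering onto the CPU)::

        >>> # hypothetical two-GPU session
        >>> parts = [torch.ones(2, 3, device=f"cuda:{i}") for i in range(2)]
        >>> gathered = gather(parts, dim=0, destination="cpu")
        >>> gathered.shape, gathered.device
        (torch.Size([4, 3]), device(type='cpu'))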
    """
    tensors = [_handle_complex(t) for t in tensors]
    if out is None:
        if destination == -1:
            warnings.warn(
                "Using -1 to represent CPU tensor is deprecated. Please use a "
                'device object or string instead, e.g., "cpu".',
                FutureWarning,
                stacklevel=2,
            )
        destination = _get_device_index(destination, allow_cpu=True, optional=True)
        return torch._C._gather(tensors, dim, destination)
    else:
        if destination is not None:
            raise RuntimeError(
                f"'destination' must not be specified when 'out' is specified, "
                f"but got destination={destination}"
            )
        return torch._C._gather_out(tensors, dim, out)