o
    wZhM                     @   s   U d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ G dd de	ZG d	d
 d
e	ZG dd de	ZG dd de	Zdaeeeej   ed< dejfddZdS )    N)chain)Optional_get_device_index)Function)commc                   @   $   e Zd Zedd Zedd ZdS )	Broadcastc                    s   t dd |D sJ ddd |D }|| _t|dkrdS t|| _|d  | _t|| j}g }t| j	dd  D ]\ }|sO|
 fd	d|D  q=| j|  tt|S )
Nc                 s       | ]	}|j jd kV  qdS cpuNdevicetype.0i r   K/var/www/auris/lib/python3.10/site-packages/torch/nn/parallel/_functions.py	<genexpr>       
z$Broadcast.forward.<locals>.<genexpr>z2Broadcast function not implemented for CPU tensorsc                 S      g | ]}t |d qS Tr   r   xr   r   r   
<listcomp>       z%Broadcast.forward.<locals>.<listcomp>r   r      c                 3   s    | ]}|  V  qd S Nr   )r   outputidxr   r   r          )alltarget_gpuslen
num_inputs
get_deviceinput_devicer   Zbroadcast_coalesced	enumerateZneeds_input_gradextendZmark_non_differentiabletupler   from_iterable)ctxr$   inputsoutputsZnon_differentiablesZinput_requires_gradr   r    r   forward   s&   


zBroadcast.forwardc                 G   s   dt j| j| jg|R   S )Nr   )ReduceAddCoalescedapplyr(   r&   r-   Zgrad_outputsr   r   r   backward   s
   
zBroadcast.backwardN__name__
__module____qualname__staticmethodr0   r4   r   r   r   r   r	      s
    
r	   c                   @   r   )r1   c                    sL    fddt dt D | _ fddt dt D }t||S )Nc                    s   g | ]} |   qS r   r'   r   )gradsr   r   r   )   s    z.ReduceAddCoalesced.forward.<locals>.<listcomp>r   c                    s   g | ]
} ||  qS r   r   r   r;   r&   r   r   r   -   s    )ranger%   r$   r   Zreduce_add_coalesced)r-   destinationr&   r;   Zgrads_r   r<   r   r0   '   s
   
 zReduceAddCoalesced.forwardc                 G   s   dt j| jg|R   S )NNN)r	   r2   r$   r3   r   r   r   r4   0   s   zReduceAddCoalesced.backwardNr5   r   r   r   r   r1   &   s
    
r1   c                   @   r   )Gatherc                    s   t dd |D sJ d|dkrd _nt|d}| _| _tdd |D  _t dd |D rI|dkrItd	d |D }td
 d _nd _t fdd|D  _	t
| j jS )Nc                 s   r
   r   r   r   r   r   r   r   ;   r   z!Gather.forward.<locals>.<genexpr>z/Gather function not implemented for CPU tensorsr   Tc                 s   s    | ]}|  V  qd S r   r:   r   r   r   r   r   D   r"   c                 s   s    | ]	}|  d kV  qdS r   N)dimr   tr   r   r   r   E       r   c                 s   s    | ]}| d V  qdS )r   N)viewrC   r   r   r   r   F   s    zvWas asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.Fc                 3   s    | ]	}|  jV  qd S r   )sizerB   r   r-   r   r   r   O   rE   )r#   target_devicer   rB   r+   
input_gpuswarningswarnunsqueezed_scalarinput_sizesr   gather)r-   rI   rB   r.   r   rH   r   r0   9   s(   

zGather.forwardc                 C   s6   t | j| j| j|}| jrtdd |D }d| S )Nc                 s   s    | ]}|d  V  qdS rA   r   )r   gr   r   r   r   X   r"   z"Gather.backward.<locals>.<genexpr>r?   )Scatterr2   rJ   rN   rB   rM   r+   )r-   grad_outputZscattered_gradsr   r   r   r4   R   s   zGather.backwardNr5   r   r   r   r   r@   8   s
    
r@   c                   @   r   )rQ   c           
   	   C   s   dd |D }|| _ |jjdkr| nd| _d }tj r*| jdkr*dd |D }t	|||| j |}|d urjt
|D ]-\}}tj||  tj }	|	||  ||	 W d    n1 sdw   Y  q<|S )Nc                 S   r   r   r   r   r   r   r   r   _   r   z#Scatter.forward.<locals>.<listcomp>r   c                 S   s   g | ]
}t td |qS )cuda)_get_streamtorchr   )r   r   r   r   r   r   e   s    )rB   r   r   r'   r(   rV   rT   Zis_availabler   Zscatterr)   Zcurrent_streamZwait_streamZrecord_stream)
r-   r$   Zchunk_sizesrB   inputstreamsr/   r   r   Zmain_streamr   r   r   r0   ]   s$   
zScatter.forwardc                 G   s    d d d t j| j| jg|R  fS r   )r@   r2   r(   rB   )r-   rR   r   r   r   r4   r   s    zScatter.backwardNr5   r   r   r   r   rQ   \   s
    
rQ   _streamsr   c                 C   sh   | j dkrdS tt| j d}|du rdS tdu rdg|  at| j du r/|| jt| j< t| j S )zBGet a background stream for copying between CPU and target device.r   N)r   getattrrV   rY   Zdevice_countindexStream)r   Z
device_modr   r   r   rU   {   s   

rU   )rK   	itertoolsr   typingr   rV   Ztorch._utilsr   Ztorch.autogradr   Ztorch.nn.parallelr   r	   r1   r@   rQ   rY   listr\   __annotations__r   rU   r   r   r   r   <module>   s   
 $