o
    wZhq6                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ er2d dlmZ dd	 d
fdefddZe jedd	 ddZe jedd	 d
dZeG dd dZeG dd dZG dd dZG dd dZd#ddZdd	 d dfddZdd  Zd!d" ZdS )$    N)deque)	dataclass)TYPE_CHECKINGprofile)
DeviceType)_KinetoEventc                 C      | j S N)childrenx r   D/var/www/auris/lib/python3.10/site-packages/torch/profiler/_utils.py<lambda>       r   Freversec                 c   sX    |rt ndd }t|| }|r*||}|V  |||D ]}|| q|sd S d S )Nc                 S      | S r
   r   r   r   r   r   r          z_traverse.<locals>.<lambda>)reversedr   append)treenext_fnZchildren_fnr   order	remaining
curr_eventchild_eventr   r   r   	_traverse   s   r   c                 C      |   S r
   )popr   r   r   r   r          T)r   r   c                 C   r   r
   )popleftr   r   r   r   r      r    c                   @   sJ   e Zd ZU dZeed< dZeed< dZeed< dZeed< e	dd Z
dS )	EventMetricsr   duration_time_nsself_time_nsidle_time_nsqueue_depthc                 C   s   | j dkrdS | j| j  S )Nr   g        )r#   r%   selfr   r   r   fraction_idle_time(   s   
zEventMetrics.fraction_idle_timeN)__name__
__module____qualname__r#   int__annotations__r$   r%   r&   propertyr)   r   r   r   r   r"   !   s   
 r"   c                   @   s*   e Zd ZU eed< eed< dZeed< dS )Intervalstartendr   r&   N)r*   r+   r,   r-   r.   r&   r   r   r   r   r0   /   s   
 r0   c                   @   s>   e Zd Zdd Zdd Zdd Zdd Zd	ee fd
dZ	dS )EventKeyc                 C   s
   || _ d S r
   event)r(   r5   r   r   r   __init__7      
zEventKey.__init__c                 C   s   t | jjS r
   )hashr5   idr'   r   r   r   __hash__:   s   zEventKey.__hash__c                 C   s   | j j|j jkS r
   )r5   r9   )r(   otherr   r   r   __eq__=   s   zEventKey.__eq__c                 C   s
   | j j S r
   )r5   namer'   r   r   r   __repr__@   r7   zEventKey.__repr__	intervalsc           	      C   s   d}t |dd d}|r*t| jj|d j}t| jj|d j}||k r*||| 7 }d\}}|t|k rw|| }|| }|d7 }|j|jkrW|j|jkrQ|d7 }q.|j|_|}t| jj|j}t| jj|j}||k rq||| 7 }|t|k s4|S )Nr   c                 S   r	   r
   r1   r   r   r   r   r   E   r   z,EventKey.intervals_overlap.<locals>.<lambda>key)r      rC   )	sortedmaxr5   start_time_nsr1   minend_time_nsr2   len)	r(   r?   Zoverlap_timeZoverlap_startZoverlap_endijZprev_intervalZcurr_intervalr   r   r   intervals_overlapC   s0   zEventKey.intervals_overlapN)
r*   r+   r,   r6   r:   r<   r>   listr0   rL   r   r   r   r   r3   6   s    r3   c                   @   sN   e Zd ZdefddZdd Zdd Zdd	 Zd
d Zdde	de
fddZdS )BasicEvaluationprofc                 C   sd   || _ i | _|   tdd | j D dd d| _dd | jD | _g | _|  | _	| 
  d S )Nc                 s   s    | ]}|V  qd S r
   r   .0er   r   r   	<genexpr>j   s    z+BasicEvaluation.__init__.<locals>.<genexpr>c                 S   s   | j jS r
   )r5   rF   r   r   r   r   r   j   r    z*BasicEvaluation.__init__.<locals>.<lambda>rA   c                 S      g | ]}|j qS r   r4   rP   r   r   r   
<listcomp>l       z,BasicEvaluation.__init__.<locals>.<listcomp>)r   metricscompute_self_timerD   keysZ
event_keyseventscuda_eventscompute_queue_depthqueue_depth_listcompute_idle_time)r(   rO   r   r   r   r6   e   s   
zBasicEvaluation.__init__c                 C   s   | j jdusJ t| j j }|rS| }|j}|jD ]}||j8 }|| qt|| j	vs<J d|j
 d|j t|d| j	t|< |j| j	t| _|sdS dS )zM
        Computes event's self time(total time - time in child ops).
        NzDuplicate id: z, )r$   )r   kineto_resultsr   Zexperimental_event_treer   r#   r   r   r3   rW   r9   r=   r"   )r(   stackr   	self_timer   r   r   r   rX   q   s$   

z!BasicEvaluation.compute_self_timec                    s2  | j jdusJ | j j }dd dd tfdd|D dd	 d
}tfdd|D dd	 d
}t|| dd	 d
| _i }d}|D ] t| fdd	|d}|| < |dur\|n|}qEd}d}|| | j }	dd }
g }|	j|
d
 |	D ]}t|dr| d }| |	  d }||v r|| dur|| }t|dr|
 }|
 |  }||v r|| dur|| }nt|dr|j}|j}|t|k r|| 
 |kr|d7 }|t|k r|| 
 |ks|| d }t|d}t|dst|dr|t||| qxt|dr|| jt| _qx|S )z
        Computes queue_depth at each event. This will calculate the queue depth data for
        All the events in the tree.
        This will return a list of Interval of queue depth data of cuda launch and kernels.
        Nc                 S   s
   | j dkS )NZcudaLaunchKernel)r=   rR   r   r   r   is_cuda_launch_kernel   s   
zBBasicEvaluation.compute_queue_depth.<locals>.is_cuda_launch_kernelc                 S   s   |   tjkod| j vS )NZmem)Zdevice_typer   CUDAr=   lowerrb   r   r   r   is_cuda_kernel   s   z;BasicEvaluation.compute_queue_depth.<locals>.is_cuda_kernelc                 3       | ]	} |r|V  qd S r
   r   rP   )rc   r   r   rS          z6BasicEvaluation.compute_queue_depth.<locals>.<genexpr>c                 S   r   r
   start_nsr   r   r   r   r      r    z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>rA   c                 3   rg   r
   r   rP   )rf   r   r   rS      rh   c                 S   r   r
   ri   r   r   r   r   r      r    c                 S   r   r
   ri   r   r   r   r   r      r    r   c                    s   |      kS r
   )Zlinked_correlation_idr   )cuda_launch_eventr   r   r      s    r@   c                 S   s@   t | dr|  d S t | dr|  S t | dr| jS td)Nstart_us  rj   rF   zUnknown Event Type)hasattrrm   rj   rF   	Exceptionr4   r   r   r   new_old_event_comparator   s   


zEBasicEvaluation.compute_queue_depth.<locals>.new_old_event_comparatorrm   rn   rj   rF   rC   )r   r_   rZ   rD   r[   index_of_first_matchsortro   rm   Zduration_usrj   Zduration_nsrF   rH   rI   rE   r   r0   rW   r3   r&   )r(   Zcuda_event_listZcuda_launch_eventsZcuda_kernel_eventsZkernel_mappingZlast_mapped_kernelindexZcurrent_kernel_indexZspawned_kernel_indexZ
all_eventsrq   r]   r5   
start_timeend_timeZcurrent_queue_depthr   )rk   rf   rc   r   r\      s   
	






z#BasicEvaluation.compute_queue_depthc                 C   s   d}d}g }| j r(| jr(|t| jd j| j d jt| j d j| jd jg7 }| j D ] }|jdkr9|s9|j}d}|jdkrK|rK|t||j d}q+dd | j	
 D }|D ]}t||| j	t| _qXdS )z4
        Computes idle time of the profile.
        Fr   rl   Tc                 S   rT   r   r4   rP   r   r   r   rU      rV   z5BasicEvaluation.compute_idle_time.<locals>.<listcomp>N)r]   rZ   r0   rF   r1   r2   rH   r&   r   rW   rY   r3   rL   r%   )r(   idleZ
idle_startZidle_intervalsZ
data_point
event_listr5   r   r   r   r^      s0   
z!BasicEvaluation.compute_idle_timec                    s  ddl }ttj}dd |D }d d}g d}|t|k ru||  kr+|d7 }qt|d t|D ]6}t| fdd|d	}t|||d
}	|	durj||	 |krjt	||	 j
|| j
 |durf|n|} nq4|d7 }|t|k s fddj D }
|
r|jfdd|
D |jd}|jfdd|
D |jd}||| || }||| || }|d|  }dd tt||
tdddD }
|
d| }
|
S )a  
        Filter and Rank the events based on some heuristics:
        1) Events that are in the falling phase of the queue depth.
        2) Events that have a high idle_time, self_time difference.

        Parameters:
            length: The number of events to return.
        r   Nc                 S   rT   r   )r&   rP   r   r   r   rU     rV   z/BasicEvaluation.rank_events.<locals>.<listcomp>   rC   c                    s   |  kS r
   r   r   )bottom_threasholdr   r   r     r    z-BasicEvaluation.rank_events.<locals>.<lambda>r@   )r1   r2   c                    s   g | ]	}|  r|qS r   )rL   rQ   r5   )decrease_intervalr   r   rU   ,  s    c                       g | ]} j | jqS r   )rW   r$   r{   r'   r   r   rU   3      )Zdtypec                    r}   r   )rW   r)   r{   r'   r   r   rU   7  r~   g333333?c                 S   s   g | ]\}}|qS r   r   )rQ   _r5   r   r   r   rU   ?  s    T)rB   r   )torchrM   r   r]   rI   rangerr   argmaxr   r0   r1   rW   rY   ZtensorZfloat32meanZstdrD   zipoperator
itemgetter)r(   lengthr   r]   Z	qd_valuesZtop_threasholdrJ   rK   Znext_minimum_idxZpeak_idxrx   ra   Z	idle_timeZnormalized_gainZnormalized_selfZheuristic_score_listr   )rz   r|   r(   r   rank_events  sf   
zBasicEvaluation.rank_eventsrC   Tr   print_enablec                    sJ     |}|s	|S |rdnd}|d fdd|D 7 }|r#t| |S )NzOptimizable events:
zNo events to optimize

c                    s@   g | ]}d  d| dt |j d j| jd ddd  	qS )zP--------------------------------------------------------------------------------z
Event:                z
Source code location: z
Percentage idle time: d   z.2fz%
)source_code_locationr5   rW   r)   r{   r'   r   r   rU   Q  s    z:BasicEvaluation.get_optimizable_events.<locals>.<listcomp>)r   joinprint)r(   r   r   rx   outputr   r'   r   get_optimizable_eventsJ  s   


z&BasicEvaluation.get_optimizable_eventsN)rC   T)r*   r+   r,   r   r6   rX   r\   r^   r   r-   boolr   r   r   r   r   rN   d   s    ^IrN   c                 C   sD   |d u s
|t | krt | }t||D ]}|| | r|  S qd S r
   )rI   r   )seq	predicater1   r2   rJ   r   r   r   rr   _  s   rr   c                 C   r   r
   r   r   r   r   r   r   h  r   c                 C   s2   | || } t | dkrd S | t| |d| S )Nr   rA   )rI   rt   rE   )r   rB   r1   r2   r   r   r   r   h  s   r   c                 C   s0   | d urt d| j}|d u r| j} q | jS dS )Nz
\.py\(.*\)zNo source code location found)researchr=   parent)r5   matchr   r   r   r   o  s   r   c                  C   s8   ddl m}  |  	 W d    d S 1 sw   Y  d S )Nr   r   )torch.autograd.profilerr   r   r   r   r   _init_for_cuda_graphs}  s   "r   )r   N)	functoolsr   r   collectionsr   dataclassesr   typingr   r   r   Ztorch.profilerr   Ztorch.autogradr   r   r   partialZtraverse_dfsZtraverse_bfsr"   r0   r3   rN   rr   r   r   r   r   r   r   r   <module>   s4   

. 
|	