o
    Zh4                     @  s   d dl mZ d dlZd dlmZmZ d dlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZmZ dd	lmZmZmZ dd
lmZ erZd dlZd dlmZ ddlmZ ddlmZ G dd dejZG dd dZdS )    )annotationsN)AnyTYPE_CHECKING   )config)
write_text)get_metric_tableis_metric_table_enabled)DevicePropertiesReductionHint)BaseSchedulerNode	Scheduler	WhyNoFuse)V)
OrderedSet)SIMDKernelFeatures)TritonKernelc                   @  s   e Zd ZdZd	ddZdS )
Sortablez>Anything that can be used as a list.sort() key (int/tuple/etc)othertyping.Selfreturnboolc                 C  s   d S N )selfr   r   r   F/var/www/auris/lib/python3.10/site-packages/torch/_inductor/choices.py__lt__   s    zSortable.__lt__N)r   r   r   r   )__name__
__module____qualname____doc__r   r   r   r   r   r      s    r   c                   @  s   e Zd ZdZd-ddZed.ddZed/ddZed.ddZed0ddZ	ed1d#d$Z
ed1d%d&Zed1d'd(Zed2d*d+Zd,S )3InductorChoicesax  
    This class contains a collection of default heuristics that effect performance of our generated
    code.  We try to not put correctness requirements in this file.

    You can override the choices made here by doing:

            class MyHeuristics(InductorChoices):
                ...

            torch._inductor.virtualized.V.set_choices_handler(MyHeuristics())
    
kernel_clstype[TritonKernel]featuresr   groupslist[sympy.Expr]kernel_kwargsdict[str, Any]r   c                 C  s   |S )zTHook to change the kwargs passed to TritonKernel, used to apply fixed configurationsr   )r   r"   r$   r%   r'   r   r   r   triton_kernel_kwargs+      z$InductorChoices.triton_kernel_kwargsr   c                 C  sr   t jjrdS t jjrtj jdkrdS tjjj	| j
dd}|dkr'd| }n	|dkr.d	}ndS tjj| j|S )
z>Heuristic to decide if a cooperative reduction should be used.TcpuF   )fallback   i      i    )r   tritonZforce_cooperative_reductionsZcooperative_reductionsr   graphZget_current_device_or_throwtypesizevars	size_hintnumelstatically_known_geqreduction_numel)r$   Zxhint	thresholdr   r   r    should_use_cooperative_reduction5   s   
z0InductorChoices.should_use_cooperative_reductioncooperative_reductionc                 C  s~   t jjsdS tjdi|  d}|r.z|dttj	j
| jd 9 }W n	 ty-   Y nw t jjr6|d9 }tj	j
| j|S )zO
        Heuristic to decide if a persistent reduction should be used.
        Fi   @       r/   )r   r0   Zpersistent_reductionsr   INNERgetget_reduction_hintminr   r1   r3   r4   r5   
ValueErrorZmulti_kernelZstatically_known_leqr7   )r$   r:   r8   r   r   r   should_use_persistent_reductionL   s"   "z/InductorChoices.should_use_persistent_reductionc                 C  s    |   tjkotjj| jdS )a  
        Heuristic to decide if we should drop the X dimension from a persistent reduction kernel.
        So the [XBLOCK, RBLOCK] block becomes a [RBLOCK] block and XBLOCK is forced to be always 1.
        Strangely this is faster than a [1, RBLOCK] block in some cases.
           )r?   r   r=   r   r1   r3   r6   r7   )r$   r   r   r   want_no_x_dimj   s   zInductorChoices.want_no_x_dimdevicetorch.devicereduction_numel_hintint
numel_hintinner_reductionc                   sB  t | }|j}d}d d}|| | } | | }	d}
d|
 }|r|d| kr*dS |dkr0dS || |kr9|}n^|| |	k r{|| d|  }|| d | }|||  d ||  t|}t|fdd	d
}t| dk rxt||}n}nt|}t| fdd	d
}t|  dk r|}n }|||  d ||  S d}d}|| d | }|| |k r|}n]|| |	k r|| | }|| d | }|||  d ||  t|}t|fdd	d
}t| dk rt||}n }nt|}t| fdd	d
}t|  dk r|}n }|||  d ||  S )zHeuristic to decide the RSPLIT used for split reductions.
        When a reduction has a small number of outputs there is not enough parallelism,
        so we will do the reduction in two phases.r<   i   i   r.   r,   r   i    c                      t |   S r   absxtmp_split_sizer   r   <lambda>       z8InductorChoices.reduction_split_factor.<locals>.<lambda>)key   c                   rK   r   rL   rN   max_elements_per_threadr   r   rR      rS   2         c                   rK   r   rL   rN   rP   r   r   rR      rS      c                   rK   r   rL   rN   rV   r   r   rR      rS   )r
   createZmulti_processor_countsympydivisorsr@   rM   max)rE   rG   rI   rJ   propsZnum_smZmin_elements_per_threadZthreads_per_smZmin_elements_per_deviceZmax_elements_per_deviceZ	num_warpsZnum_threadsZ
split_sizeZtarget_blocksZblocks_per_outputr^   ZclosestZrvals_per_threadZxvals_per_blockZxblocksr   )rW   rQ   r   reduction_split_factorv   sv   





z&InductorChoices.reduction_split_factor	schedulerr   node1r   node2shared_data_scorec                   s   |dkrCt jr s rCtdr:j j @  t dkr:td fdd t	d dS t	d dS 
 sc
 sct t  t jkrct	d dS rrt	d	 dS d
S )a  
        Heuristics to prevent fusion applied to both horizontal and vertical fusions.  Heuristics here should not
        be needed for correctness and tweaking them may yield additional performance.

        See also some related heuristics that can be changed via config:
            - config.triton.tiling_prevents_pointwise_fusion
            - config.triton.tiling_prevents_reduction_fusion
            - config.aggressive_fusion (will cause this function to be called more times)
        r   Z'fusion_failure_due_to_indexing_mismatchc                     sD   t jjt jj  t t t  dS )N)Zpre_grad_graph_idpost_grad_graph_idZ
node1_nameZ
node2_nameZnode1_debug_strZnode2_debug_strZcommon_buffer_namesZfailure_reason)	r   r1   Zgraph_idrf   get_namer   Z	debug_strlistZdecide_fusion_fail_reasonr   Zcommon_buf_namesrc   rd   rb   r   r   rR      s   

z*InductorChoices.can_fuse.<locals>.<lambda>z'no shared data due to indexing mismatchFzno shared datazexceeds max fusionz Fusion will increase peak memoryT)r   Zaggressive_fusionis_reductionr	   Zread_writesZbuffer_nameslenr   add_rowr   Z
is_foreachZ	get_nodesZmax_fusion_sizeZcan_fusion_increase_peak_memoryrb   rc   rd   re   r   ri   r   can_fuse   s:   zInductorChoices.can_fusec                 C  s   dS )zCHook for heuristics to prevent vertical (producer/consumer) fusionsTr   rm   r   r   r   can_fuse_vertical  r*   z!InductorChoices.can_fuse_verticalc                 C  s>   |t jk rt||d dS | ||rt||d dS dS )zEHook for heuristics to prevent horizontal (consumer/consumer) fusionsscore_fusion_memory_thresholdFz=Nodes are too far away. Fusing them may increase peak memory.T)r   rp   r   Zare_long_distant_nodesrm   r   r   r   can_fuse_horizontal  s   
z#InductorChoices.can_fuse_horizontalr   c                 C  sx   |  ||}tt|j|j t|j|j  }| rd}nd| tjko*|dk }|| | ko8|dk||fS )a  
        Assign a score (higher comes first) to the fusion of node1 and node2.
        When different fusions conflict with each other, this is the way we
        decide what order to run them in.

        Our current score is based on:
        - The type of fusion (template/reduction/etc)
        - Estimate of the saved memory operations
        - Fusions closer together in original graph order
        r   r   )	Zscore_fusion_memoryr_   rM   Z	min_orderZ	max_orderZis_templater   Zepilogue_fusion_firstrj   )rb   rc   rd   Zmemory_scoreZproximity_scoreZtemplate_scorer   r   r   score_fusion"  s    zInductorChoices.score_fusionN)
r"   r#   r$   r   r%   r&   r'   r(   r   r(   )r$   r   r   r   )r$   r   r:   r   r   r   )
rE   rF   rG   rH   rI   rH   rJ   r   r   rH   )
rb   r   rc   r   rd   r   re   rH   r   r   )rb   r   rc   r   rd   r   r   r   )r   r   r   r    r)   staticmethodr9   rB   rD   ra   rn   ro   rq   rr   r   r   r   r   r!      s&    

U9	r!   ) 
__future__r   typingr   r   r]    r   Z	codecacher   Zmetricsr   r	   Zruntime.hintsr
   r   rb   r   r   r   Zvirtualizedr   ZtorchZtorch.utils._ordered_setr   Zcodegen.simd_kernel_featuresr   Zcodegen.tritonr   Protocolr   r!   r   r   r   r   <module>   s"    