o
    Zh                    @  s*  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dl m!Z! d dlm"Z" d dl#m$Z$ d d	l%m%Z%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3 d dl4Z4d dl5m6Z7 d dl4m8Z8m9Z9 d dl:m;Z;m<Z<m=Z= d dl>m?Z?m@Z@mAZA d dlBmCZC d dlDmEZEmFZF d dlGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQ d dlRmSZS d dlTmUZUmVZV d dlWmXZXmYZY d dlZm[Z[m\Z\ d dl]m^Z^m_Z_ d dl`maZambZbmcZcmdZd d dlemfZf d dlgmhZhmiZimjZj d dlkmlZl d dlmm?Zn d dlompZpmqZq d dlrmsZsmtZtmuZu d dlvmwZw d d!lxmyZy d d"lzm{Z{ d d#l|m}Z} d d$l~mZ d d%lmZ e? rd d&lmZ d d'lmZmZmZmZ ndd-d.Zdd/d0Zdd1d2Zdd4d5Ze/rd d6lmZmZmZ d d7lmZ d d8lmZmZ d d9lmZ d d:lmZ d d;lmZmZ d d<lzmZmZ d d=lmZmZ d d>lmZ d d?lmZ e0d@ZejdAkZdBZe4jedCZeeZddEdFZedddIdJZG dKdL dLZG dMdN dNeZG dOdP dPeZddQdRZddUdVZddd[d\Z	WdddadbZ	XdddedfZ	W	X	WdddhdiZddkdlZ	m	mdddqdrZejG dsdt dtZddvdwZdd{d|ZG d}d~ d~ejZdddZeddddZdddZejG dd dZG dd deŃZG dd dZdddZdddZG dd dZedd ddZebG dd dZG dd dZebejdddZdddZdaded< dddZebG dd dZӐdddZebG dd deӃZebG dd deՃZebG dd deՃZאdddÄZؐdddƄZebG ddȄ dȃZڐddd̈́ZېdddЄZܐdddӄZݐdddՄZސdddׄZߐdddلZ	dd	ddZG dd dZebG dd dZebG dd dZG dd dZG dd deZdS (
      )annotationsN)bisect_right)copy)c_void_pCDLLcdll)	timedelta)partial)Path)timetime_ns)
ModuleType)AnyCallablecastNoReturnOptionalTYPE_CHECKINGTypeVarUnion)Self)SymIntTensor)CompileEventLoggercountersdynamo_timed)configexcmetrics)cuda_env)rocm_compile_commandrocm_compiler)
_LINKER_SCRIPT_set_gpu_runtime_env_TORCH_PATH_transform_cuda_paths
CppBuilder
CppOptionsCppTorchDeviceOptionsget_compiler_version_info&get_name_and_dir_from_output_file_pathnormalize_path_separator)pick_vec_isa)CustomGraphPassCustomGraphPassType)has_frozen_paramsis_frozen_param)_reload_python_module _reload_python_module_in_subproc)	cache_dirdefault_cache_dir)ALIGN_BYTESclear_on_fresh_inductor_cacheis_linux
is_windows)trace_structured)extract_tensor_metadata
FakeTensorTensorMetadata)log_cache_bypass)r   )CacheArtifactManagerCacheArtifactType)has_hinthint_intShapeEnv)
OrderedSet   )CUSTOM_OBJ_FILENAME_PREFIX)create_cache)autotune_cache)AutotuneCacheBundler)TritonBundler)build_paths)log_global_cache_errorslog_global_cache_statslog_global_cache_valsuse_global_cacheargsr   kwargsreturnNonec                  O     d S N rO   rP   rU   rU   H/var/www/auris/lib/python3.10/site-packages/torch/_inductor/codecache.pyrK   p      rK   c                  O  rS   rT   rU   rV   rU   rU   rW   rL   s   rX   rL   c                  O  rS   rT   rU   rV   rU   rU   rW   rM   v   rX   rM   boolc                   C     dS NFrU   rU   rU   rU   rW   rN   y   rX   rN   )	GeneratorKeysViewSequence)Future)_CompileFxKwargsCompiledFxGraphGraphLowering)ChoiceCaller)CompiledFxGraphConstants
OutputCode)
JsonDataTyRemoteCache)HalideInputSpec
HalideMeta)CachingAutotuner)	InputTypeTwin32iX  output_codestrc                   C  s   t jjd u rdS dS )NZ
cubin_pathZ
hsaco_path)torchversionhiprU   rU   rU   rW   get_cpp_wrapper_cubin_path_name   s   rt   global_cache_dirOptional[Path]c                 C  s&   | d urt tj| t d S d S )Nhash)r
   ospathjoin	CacheBase
get_system)ru   rU   rU   rW   get_global_cache_path_impl   s
   r}   c                   @  sl   e Zd ZeeddddZeeeddddZedd
dZ	dddZ
dddZdddZdS )r{   NrQ   dict[str, Any]c               	   C  s   zddl m}  |  }W n ty   d }Y nw z9dd id|id}tjtj }tjjd ur@|j|d d< tjj|d d< n|j	|d d< tjj
|d d	< W n ttfy]   i }Y nw ttj|d
dd |d< |S )Nr   )
triton_keynameZtriton)devicerr   r   rr   cudars   T)	sort_keysutf-8rw   )Ztriton.compiler.compilerr   ModuleNotFoundErrorrq   r   Zget_device_propertiesZcurrent_devicerr   r   ZgcnArchNamers   AssertionErrorRuntimeErrorhashlibsha256jsondumpsencode	hexdigest)r   Ztriton_versionsystemZdevice_propertiesrU   rU   rW   r|      s:   
zCacheBase.get_systemr
   c                   C  s   t tjt dt d S )Ncacherw   )r
   rx   ry   rz   r3   r{   r|   rU   rU   rU   rW   get_local_cache_path   s   zCacheBase.get_local_cache_pathrv   c                   C  s
   t tjS rT   )r}   r   ru   rU   rU   rU   rW   get_global_cache_path      
zCacheBase.get_global_cache_pathrR   c                 C  s   t  | _d S rT   )r{   r|   r   selfrU   rU   rW   __init__   s   zCacheBase.__init__c                 C  sT   |   }| s
i S t|}t|}W d    |d S 1 s!w   Y  |d S Nr   )r   is_fileopenr   load)r   local_cache_pathZlocal_cache_fplocal_cacherU   rU   rW   get_local_cache   s   

zCacheBase.get_local_cacher   c                 C  s0   |   }tt|tj| j|ddddd d S )N)r   r      )indentT	make_dirs)r   write_atomicrp   r   r   r   )r   r   r   rU   rU   rW   update_local_cache   s   
zCacheBase.update_local_cacherQ   r~   )rQ   r
   )rQ   rv   rQ   rR   )r   r~   rQ   rR   )__name__
__module____qualname__staticmethod	functools	lru_cacher|   r6   r   r   r   r   r   rU   rU   rU   rW   r{      s    $

r{   c                   @  s    e Zd ZdddZdd
dZdS )
LocalCachekeysrp   rQ   Optional[dict[str, Any]]c                 G  s0   |   }|}|D ]}||v r|| }q d S |S rT   )r   )r   r   r   	sub_cachekeyrU   rU   rW   lookup   s   
zLocalCache.lookupvaluer   rR   c                G  sL   |   }|}|dd D ]}||i  || }q|||d < | | d S )Nr   )r   
setdefaultr   )r   r   r   r   r   r   rU   rU   rW   	set_value   s   
zLocalCache.set_valueN)r   rp   rQ   r   )r   rp   r   r   rQ   rR   )r   r   r   r   r   rU   rU   rU   rW   r      s    
r   c                   @  s*   e Zd ZeddddZdddZdS )PersistentCacheNrQ   r~   c                 C  s\   |   }|d u s| si S t|}t|}W d    |d S 1 s%w   Y  |d S r   )r   r   r   r   r   )r   Zglobal_cache_pathZglobal_cache_fpZglobal_cacherU   rU   rW   get_global_cache   s   

z PersistentCache.get_global_cachechoiceslist[ChoiceCaller]oprp   inputs	benchmark4Optional[Callable[[Any], dict[ChoiceCaller, float]]]dict[ChoiceCaller, float]c              
     sz  t  tt| j}tt| j}tt| j}i dd fdd	}tjs3tj	rtj
r:|  ni }	||	st rK||  |d
s|durz:| tfdd D saJ |	i  |	 i i   D ]\}
}||	   |
 < qwW n ty } z|| |d}~ww | |	 fdd D }|| S t r||  |d
 S )aG  
        Check to see if we have benchmarked the given choice callers. For each
        choice caller:

            1. Check global_cache[op][inputs][choice][precision], return benchmark if cached.
            2. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
            3. If benchmark is not None:
                a. `max_autotune_gemm=True`: benchmark the choice, update
                    local_cache[op][inputs][choice], and return the benchmark.
                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
        Nr   r~   callbackr   rQ   rY   c                   sf   d} D ]%}|  }|| i i i v r'|    | |< qd} |r1||d |S )z2Check if `cache` contains data for all the choicesTF)cached)hash_keyget)r   r   hitchoiceZchoice_hashr   r   r   	precisiontimingsrU   rW   check_cache$  s    
z+PersistentCache.lookup.<locals>.check_cache)r   c                 3  s    | ]}| v V  qd S rT   rU   .0r   r   rU   rW   	<genexpr>B      z)PersistentCache.lookup.<locals>.<genexpr>c                   s   i | ]	}|   | qS rU   )r   r   r   rU   rW   
<dictcomp>N  s    z*PersistentCache.lookup.<locals>.<dictcomp>rT   )r   r~   r   r   rQ   rY   )rq   Zget_float32_matmul_precisionr	   rL   r   rM   rK   r   Zmax_autotuneZmax_autotune_gemmZautotune_local_cacher   rN   r   allr   itemsr   r   r   )r   r   r   r   r   Z	log_statsZlog_valsZ
log_errorsr   r   r   ZtimingeZtimings_to_logrU   r   rW   r   	  sN   

zPersistentCache.lookupr   )
r   r   r   rp   r   rp   r   r   rQ   r   )r   r   r   r   r   r   r   rU   rU   rU   rW   r      s    r   c                  C  s.   t jt d} t j| st j| dd | S )NlocksTexist_ok)rx   ry   rz   r3   existsmakedirs)lock_dirrU   rU   rW   get_lock_dirZ  s   r   databytesc                 C  s&   t t|  d d d S )N3   r   )base64	b32encoder   r   digestdecodelower)r   rU   rU   rW   sha256_hasha  s   &r    codeUnion[str, bytes]extrac                 C  sL   t | tr| n| d}|r t |tr|n|d}|d | }dt| S )Nr   s   ||c)
isinstancer   r   r   )r   r   Zhashing_strZextra_brU   rU   rW   	code_hashf  s
   r   basename	extensionspecified_dirtuple[str, str, str]c                 C  sb   |rt j|r|}nt jt |}nt jt | dd }t j||  d| }| ||fS )NrD      .)rx   ry   isabsrz   r3   )r   r   r   subdirry   rU   rU   rW   get_pathn  s   
r   content	hash_typec                 C  s4   |dkr	t | |S |dv rt t| S td| )Nr   )cubinZhsacoZspvzUnknown hash type )r   reprr   )r   r   r   rU   rU   rW   get_hash|  s
   
r   tuple[str, str]c           	      C  sD   t |  ||}t|||\}}}tj|st|| dd ||fS )NTr   )r   stripr   rx   ry   r   r   )	r   r   r   r   r   r   r   Z_subdirry   rU   rU   rW   write  s
   
r   textc                 C  s   t | dd S )zT
    Write the `text` to a file and return the path computed based on the hash.
    txtrD   r   )r   rU   rU   rW   
write_text  s   r   Fpath_r   encode_utf_8c                 C  s   t |ttfsJ dt| }|r|jjddd |jdt  dt	  d }t |tr0dnd}|j
||r9dnd d	}|| W d    n1 sMw   Y  z	|j|d
 W d S  tyt   tse tj||d t| Y d S w )Nz6Only strings and byte arrays can be saved in the cacheT)parentsr   r   z.tmpwwbr   )encoding)target)srcdst)r   rp   r   r
   parentmkdirrx   getpid	threading	get_identr   r   renameFileExistsError_IS_WINDOWSshutilcopy2remove)r   r   r   r   ry   Ztmp_pathZ
write_modefrU   rU   rW   r     s&    r   c                   @  s"   e Zd ZU dZded< ded< dS )TensorMetadataAndValueszk
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    r<   Ztensor_metadata	list[Any]valuesNr   r   r   __doc____annotations__rU   rU   rU   rW   r    s   
 r  xc                 C     | S rT   rU   r  rU   rU   rW   _ident  rX   r  tr   r<   c                 C  s&   t | }t| dstj|ddd}|S )zs
    Extracts the tensor metadata and removes fields of the TensorMetadata
    that are not needed for caching
    Z_is_inductor_staticr   N)Zstorage_offsetZstorage_bytes)r:   hasattrdataclassesreplace)r  metarU   rU   rW   %extract_tensor_metadata_for_cache_key  s   
r!  c                      sv   e Zd ZdZ	d+d, fd	d
Zd-ddZd.ddZd/ddZd0ddZd1ddZ	d2d!d"Z
d3d$d%Zd4d)d*Z  ZS )5FxGraphCachePicklera:  
    Custom pickler to customize the pickling of some objects (Tensors), only for the
    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
    objects that don't pickle and/or vary between runs, and we want to capture the
    data that allow us to compute a stable, but safe hash.
    Fgmtorch.fx.GraphModulehas_user_defined_triton_kernelsrY   rQ   rR   c                   s   t  | _t | j tj | _| jt	t
| jtjt
| jtjjjt
| jtjt
| jtjjjjt
| ji |rKt
| j| j|j< d| _dS )a2  
        Create an FX graph pickler. If include_non_inlined=True, then pickling will
        include the _values_ for all Tensors. (Note that any tensors are constants
        attached as attributes to the GraphModule). Otherwise, pickling will include
        only the metadata for these tensors.
        TN)ioBytesIO_streamsuperr   copyregdispatch_tabler   updater;   r   r	   _reduce_fake_tensorrq   r   _reduce_tensornnZ	parameter	Parameterr   _reduce_symintfxZexperimentalZ_backward_stateZBackwardState_reduce_unsupported_reduce_graph_module	__class__fast)r   r#  r%  r5  rU   rW   r     s$   

zFxGraphCachePickler.__init__r  r   .tuple[Callable[[T], T], tuple[TensorMetadata]]c                 C  s   t |}t|ffS )z7
        Custom reducer to pickle FakeTensors.
        )r!  r  )r   r  metadatarU   rU   rW   r-    s   
z'FxGraphCachePickler._reduce_fake_tensorNtuple[Callable[[T], T], tuple[Union[TensorMetadata, TensorMetadataAndValues]]]c                 C  s   ddl m} |jrtdt|}t|r||st|ffS t }|	 }t | }|dkr9t
d|dd tt||ffS )z
        Custom reducer to pickle Tensors.  If we see tensors, we know they're constants
        stored as attributes on the GraphModule.
        rD   rb   zmkldnn tensors unpickleableg      ?z0FX graph cache copying of a large constant took z.1zs. Please file an issue.)graphrc   	is_mkldnnBypassFxGraphCacher!  r0   Zcan_inline_constantr  r   tolistwarningswarnr  )r   r  rc   r9  startr  elapsedrU   rU   rW   r.    s   

z"FxGraphCachePickler._reduce_tensorsr   #tuple[Callable[[T], T], tuple[str]]c                 C  s   t t|ffS )z3
        Custom reducer to pickle SymInts.
        )r  rp   r   rC  rU   rU   rW   r1  /  s   z"FxGraphCachePickler._reduce_symintr   r   c                 C  s   t d)z{
        Custom reducer to handle any objects that we don't support and therefore
        raise to bypass caching.
        zReduce unsupported)r=  rE  rU   rU   rW   r3  8  s   z'FxGraphCachePickler._reduce_unsupported&tuple[Any, tuple[dict[str, Any], str]]c                 C  sH   |  \}\}}|d }tdd|}tdd|}||d< |||ffS )a  
        Custom reducer for graph module to handle irrelevant data for user
        defined triton kernels
        Essentially what we are doing here is a huge hack where user defined
        triton kernel contain a dynamo time side table and the arguments to the
        call_function are indicies into this side table. These arguments are not
        for hashing purposes since we included the source code into the cache
        key and the numbers are prone to give false negatives due to ordering.
        _codezkernel_idx = \d+r   zconstant_args_idx = \d+)
__reduce__resub)r   r#  fnr   Zimportsr   rU   rU   rW   r4  ?  s   z(FxGraphCachePickler._reduce_graph_moduleobjr   c              
   C  s   z3z|  | | j W W | jd | jd S  ttfy3 } ztjddd t	d|d}~ww | jd | jd w )z<
        Pickle an object and return a byte string.
        r   zFailed to pickle cache keyTexc_infoN)
dumpr(  getvalueseektruncate	TypeErrorAttributeErrorlogwarningr=  )r   rL  r   rU   rU   rW   r   R  s   

zFxGraphCachePickler.dumpsrp   c                 C  s   |  |}t|S )zE
        Serialize an object and return a hash of the bytes.
        )r   r   )r   rL  Zserialized_datarU   rU   rW   r   b  s   
zFxGraphCachePickler.get_hashinpFxGraphHashDetails	list[str]c           
        s   d fdd}g }t | D ]k\}}t|tr>tt|D ]} || }|d| d| d| d	|||   qqt|tre| D ]\}}	 |	}|d| d| d| d	||	  qGq |}|d| d| d
||  q|S )z
        Get a printable string describing in more detail all the attributes
        comprising an object. Useful for debugging when one graph hashes
        to a different value than another.
        rL  r   rQ   rp   c                   sV   t | tjrtt| S t | trdS t|  jv r't jt|  | d S t| S )Nz<bytes>rD   )r   rq   r   rp   r!  r   typer+  )rL  r   rU   rW   get_strp  s   
z0FxGraphCachePickler.debug_lines.<locals>.get_str[z] z]: z: NrL  r   rQ   rp   )	varsr   r   listrangelenr   appenddict)
r   rW  r[  linesattrrL  iihkvrU   r   rW   debug_linesi  s    
,

(
"zFxGraphCachePickler.debug_linesF)r#  r$  r%  rY   rQ   rR   )r  r   rQ   r8  )r  r   rQ   r:  )rC  r   rQ   rD  )rC  r   rQ   r   )r#  r$  rQ   rF  )rL  r   rQ   r   r]  )rW  rX  rQ   rY  )r   r   r   r  r   r-  r.  r1  r3  r4  r   r   rj  __classcell__rU   rU   r7  rW   r"    s    

$
	
"
	


r"  rootslist[str] | Noneprefixhasherhashlib._Hashc              	   C  s   t t| |dd dD ]L}|j|jd }|d usJ |j}|d us%J t|d}||j	d ||
  W d    n1 sEw   Y  |jrXt|j|j d| qd S )Nc                 S  s   | j S rT   )r   r  rU   rU   rW   <lambda>  s    z!build_code_hash.<locals>.<lambda>r   rbr   r   )sortedpkgutiliter_modulesmodule_finder	find_specr   originr   r,  r   readispkgbuild_code_hashsubmodule_search_locations)rm  ro  rp  libspecmoduler  rU   rU   rW   r}    s   r}  c                  C  sx   t ddd, t sddd	} | tW  d
   S ddlm} |d dW  d
   S 1 s5w   Y  d
S )zS
    Compute a key that contains relevant information about torch source files
    Zinductor_codecache_torch_keyT)Zlog_pt2_compile_eventrootrp   rQ   r   c              	     s   d}t jt  fdd|D }t }|tj	d t
| gd| |D ]$}t j|rKt|d}||  W d    n1 sFw   Y  q'| S )N)z"codegen/aoti_runtime/interface.cppcodegen/cpp_prefix.hz	script.ldc                   s   g | ]	}t j |qS rU   )rx   ry   rz   r   r  Zinductor_rootrU   rW   
<listcomp>  s    z4torch_key.<locals>.get_code_hash.<locals>.<listcomp>r   r   rt  )rx   ry   dirname__file__r   r   r,  rq   __version__r   r}  r   r   r{  r   )r  Zextra_filesrp  ry   r  rU   r  rW   get_code_hash  s   z torch_key.<locals>.get_code_hashNr   parutilztorch/src_hash.txtascii)r  rp   rQ   r   )	r   r   	is_fbcoder$   libfb.pyr  Zget_file_contentsrstripr   )r  r  rU   rU   rW   	torch_key  s   
$r  c                   C  s   t jtS rT   )rx   ry   r  r  rU   rU   rU   rW   get_inductor_root     r  c                   @  s   e Zd ZU dZded< dS )OrderedSetHolderzb
    See FxGraphHashDetails. Holds a sorted list to support stable hashing
    of set kwargs.
    r  r   Nr  rU   rU   rU   rW   r    s   
 r  c                   @  s   e Zd ZdZdS )r=  zI
    Exception to indicate that the FxGraphCache should be bypassed.
    N)r   r   r   r  rU   rU   rU   rW   r=    s    r=  c                   @  s*   e Zd ZdZdgZdddZdddZdS )rX  zz
    Object to capture all the details for a compiled FX graph relevant to computing
    a safe and stable cache key.
    Zgraph_idr#  r$  example_inputsSequence[InputType]	fx_kwargsr`   inputs_to_checkSequence[int]rQ   rR   c                 C  s  || _ || _tj| _i | _t| D ] \}}|| jvr3t|t	t
fv r.tt|| j|< q|| j|< qddlm}m}m}	 ddlm}
 g | _|d ur| D ][}t|tjjsYqOt|jjd|d|jjd|	dD ]>}ddlm} ||jd }d }t||r|j rt!tdd	 |j D }|j"}|
|}|#|jd
 }| j$|||f qkqO|| _%t&dd	 |D  }|rtj'( rtj') | _*t+ t, tj-j.j/f| _0tj1j2j3j4tj1j2j3j5tj1j2j3j6f| _7t8 | _9t:; | _<t=> | _?| @t=jA| _A| @t=jB| _Bd S )Nr   )kernel_side_table triton_kernel_wrapper_functionaltriton_kernel_wrapper_mutation)9user_defined_triton_kernel_transitive_closure_source_codeZcall_function)r   r  )	AutotunerZ
kernel_idxc                 s  s*    | ]}t d d |  D V  qdS )c                 s  s    | ]}t |V  qd S rT   )rp   )r   kvrU   rU   rW   r     r   z8FxGraphHashDetails.__init__.<locals>.<genexpr>.<genexpr>N)ru  Z
all_kwargsr   )r   r   rU   rU   rW   r     s
    
z.FxGraphHashDetails.__init__.<locals>.<genexpr>Zconstant_args_idxc                 s  s    | ]	}t |tjV  qd S rT   )r   rq   r   r  rU   rU   rW   r   (  s    )Cr#  r  cconfigZcache_key_tagr  ru  r   EXCLUDED_KWARGSrZ  setrC   r  Z*torch._higher_order_ops.triton_kernel_wrapr  r  r  Ztorch._inductor.codegen.wrapperr  user_defined_triton_sourcemodulesr   rq   r2  GraphModule	itertoolschainr;  Z
find_nodesZtriton.runtime.autotunerr  Z
get_kernelrP   configsrp   rK  Zget_constant_argsrb  r  anyZacceleratoris_availableZcurrent_device_indexZdefault_cuda_device_indexZ$are_deterministic_algorithms_enabledZ-is_deterministic_algorithms_warn_only_enabledutilsZdeterministicZfill_uninitialized_memoryZ!deterministic_algorithms_settingsbackendsr   matmulZ
allow_tf32Z&allow_fp16_reduced_precision_reductionZ&allow_bf16_reduced_precision_reductionZcuda_matmul_settingsr  Ztorch_versionr{   r|   Zsystem_infor   Zsave_config_portableZinductor_config_get_custom_pass_detailpost_grad_custom_pre_passpost_grad_custom_post_pass)r   r#  r  r  r  rh  ri  r  r  r  r  r  noder  kernelr  Zkernel_sourceZconstant_argsZno_tensor_inputsrU   rU   rW   r     s   


#





zFxGraphHashDetails.__init__custom_passr.   Optional[Any]c                 C  s   |sd S t |tsJ | S rT   )r   r-   uuid)r   r  rU   rU   rW   r  I  s   z*FxGraphHashDetails._get_custom_pass_detailN)
r#  r$  r  r  r  r`   r  r  rQ   rR   )r  r.   rQ   r  )r   r   r   r  r  r   r  rU   rU   rU   rW   rX    s
    
nrX  r#  r$  r  r  r  r`   r  r  tuple[str, list[str]]c           
      C  sf   t | |||}t|jdk}t| |}d|| }||}d|}	td| d|	  ||fS )z=
    Generate a unique hash of the FX graph for caching.
    r   r  
z$FX graph cache hash details for key z:
)	rX  ra  r  r"  r   rj  rz   rU  debug)
r#  r  r  r  detailsr%  Zpicklerr   rj  Z	debug_strrU   rU   rW   compiled_fx_graph_hashR  s   	


r  time_saved_nsintc                 C  s|   t j r
t j sdS t| d }t r-t jd}t	
d|| |t|| d 7 }t	
d| tjt|d |S )z}
    Ephemerally increases the NCCL timeout when compiling for a distributed job
    Returns amount of seconds increased
    r   g    eAz>pytorch/remote_cache:ephemeral_timeout_fudge_factor_percentagezNEphemeral NCCL timeout increase fudge factor %d and original increase value %dd   zIncreasing NCCL timeout by %d)seconds)rq   distributedr  Zis_initializedr  r   r  _utils_internalZjustknobs_getval_intrU  infodistZdistributed_c10dZ"_add_ephemeral_timeout_for_all_pgsr   )r  Zincreased_timeout_secZfudge_factorrU   rU   rW   .add_ephemeral_timeout_increase_for_distributedh  s$   r  c                   @  s   e Zd ZdZed<ddZed=ddZed>ddZed?ddZed@ddZ	edAddZ
edBd"d#ZedCd&d'ZedCd(d)ZedDd0d1ZedEd2d3ZedFd7d8ZedGd9d:Zd;S )HFxGraphCachea7  
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphCacheDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          <temp dir>/<fx graph hash>/<serialized metatdata>
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    rQ   rp   c                   C  s   t jt dS )zS
        Get the toplevel temporary directory for storing compiled graphs.
        Zfxgraph)rx   ry   rz   r3   rU   rU   rU   rW   _get_tmp_dir  s   zFxGraphCache._get_tmp_dirr   c                 C  s   t jt | dd | S )zA
        Return the disk location for a given cache key.
        rD   r   )rx   ry   rz   r  r  rs  rU   rU   rW   _get_tmp_dir_for_key  s   z!FxGraphCache._get_tmp_dir_for_keyr   r  list[torch.SymInt]c                 C  s   dd | D S )z
        Get the backed SymInt objects from the input list. Note that we can never
        have guards that depend on unbacked symint.
        c                 S  s$   g | ]}t |tjrt|r|qS rU   )r   rq   r   r@   r   rC  rU   rU   rW   r    s   $ z7FxGraphCache._filter_backed_symints.<locals>.<listcomp>rU   )r   rU   rU   rW   _filter_backed_symints  s   z#FxGraphCache._filter_backed_symintsOptional[ShapeEnv]c                  C  s   t jj } | s
dS | jjS )zG
        Helper to get the shape env from the tracing context.
        N)rq   Z_guardsZTracingContextZtry_getZ	fake_mode	shape_env)ctxrU   rU   rW   _get_shape_env  s   zFxGraphCache._get_shape_envr  localrY   remote_cache!Optional[RemoteCache[JsonDataTy]]	constantsre   0tuple[Optional[CompiledFxGraph], dict[str, Any]]c                   s  t  }|dus
J t |}dd |D }dfdd}d}	d}
t }| D ]$\}}
|js4|}	 nt||j|}td|j|| |rM|}	 nq)|	du rVd|fS |
durbt	
tj|
 |	j }rt|}| }durt||d	< tjd
|jd t|jdkrtd z|	| ddlm} |jdur||	j W n ty   d|f Y S w t }|	jtj|d |	jrt||	j|}|du sJ td|j  t!j"#|	j$ t%d  |	j&7  < t'd t'd  t(d fddfddd |	|fS )z
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.
        Nc                 S  s   g | ]}t |qS rU   )rA   r  rU   rU   rW   r        z.FxGraphCache._lookup_graph.<locals>.<listcomp>rQ   4Generator[tuple[CompiledFxGraph, bytes], None, None]c               	   3  s,   rPt  } tj| rPtt| D ]:}z(ttj| |d}|	 }t
||fV  W d    n1 s9w   Y  W q tyO   tjddd Y qw rz0  }d urt|tscJ |d }t|ttfspJ t|}t
||fV  W d S W d S  ty   tjddd Y d S w d S )Nrt  z,fx graph cache unable to load compiled graphTrM  r   )r  r  rx   ry   r   ru  listdirr   rz   r{  pickleloads	ExceptionrU  rV  r   r   rc  rp   r   r   	b64decode)r   ry   r  r   
cache_datar   )r   r  r  rU   rW   iterate_over_candidates  s@   


z;FxGraphCache._lookup_graph.<locals>.iterate_over_candidateszEfx graph cache key %s evaluating guards [%s] with values %s => hit=%striton_bundler_metaZinductor_compile)cached_kernel_namesr   Znum_triton_bundlesrD   rb   r   Tz*fx graph cache key %s post-load guards: %sinductorzOutput code: 
%szOutput code written to: %sZinductor_output_codec                     s   d iS )NfilenamerU   rU   )artifact_pathrU   rW   rr  J  s    z,FxGraphCache._lookup_graph.<locals>.<lambda>c                         S rT   rU   rU   r  rU   rW   rr  K      Z
payload_fn)rQ   r  ))r  r  r  rc  guards_exprrY   Zevaluate_guards_expressionrU  r  r>   record_artifactr?   INDUCTORZ_triton_bundlerI   Zread_and_emitrp   r   Ztry_add_pt2_compiler  ra  increment_toplevelZafter_deserializationr;  rc   Zsave_output_codesource_codeOSErrorrG   Zinductor_meta_from_configrH   Zbegin_compileguardsr   ZCachedMetricsHelperZapply_deltasZmetrics_deltasr   Zcounter_deltasoutput_code_logr9   )r   r  r  r  r  r  symintshintsr  r;  Zpickled_content
cache_info	candidater   Zbundler  r   rc   Zinductor_metacheckrU   )r  r   r   r  r  rW   _lookup_graph  s   
 






zFxGraphCache._lookup_graphr   r   rR   c                 C  sH   t | }tj|stj|dd tj|t|}t||dd d S )NTr   r   )	r  r  rx   ry   r   r   rz   r   r   )r   r   r   ry   rU   rU   rW   _write_to_local_cacheO  s
   
z"FxGraphCache._write_to_local_cachecompiled_graphrf   c                 C  sT  ddl m} t||sJ dt| dt|}|  t }|dus'J t|}|	|}	|j
||	d|_zt|}
W n ty[   tjddd	 td
 d  d7  < Y dS w z2ttj| |
 |rmt| |
 |rt|jptdd }t|
d|d}|| | W dS W dS  ty   tjddd	 td
 d  d7  < Y dS w )z=
        Store a serialized CompiledFxGraph on disk.
        rD   )ra   zserialization for z NYIN)Zplaceholdersr  z1fx graph cache unable to serialize compiled graphTrM  r  Zfxgraph_cache_pickle_errorr   g    .Ar  )r   time_taken_msz!fx graph unable to write to cacheZfxgraph_cache_write_error)
compile_fxra   r   rZ  r   Zprepare_for_serializationr  r  r  Zget_pruned_guardsZproduce_guards_expressionr  r  r   r  rU  rV  r   r>   r  r?   r  r  r  _time_taken_nsr   	b64encoder   put)r   r  r  r  r  ra   Zdisk_compiled_graphr  r  r  r   r  r  rU   rU   rW   _save_graph[  sN   

zFxGraphCache._save_graphr#  r$  c                 C  s   |   D ]<}t|tjjsq|jjD ]-}t|jtjj	r+|j
 s+td|j  |jdkr?tt| |jtjjr?tdqqd S )Nz!Can't cache HigherOrderOperator: getattrzCan't cache torchbind objects)r  r   rq   r2  r  r;  nodesr  Z_opsZHigherOrderOperator	cacheabler=  r   r   r  _CScriptObject)r#  r  r  rU   rU   rW   _check_for_hop  s$   zFxGraphCache._check_for_hopc                 C  s   t jt jfD ]}|rt|tr| stdqt| r&tj	
ds&tdt jjr.tdddlm} |jr>td tt du rMtd	 td
t|  dS )z
        Check some conditions that would preclude caching and raise BypassFxGraphCache
        to bypass in case caching is not possible.
        z!Unsupported post grad custom passz,pytorch/inductor:allow_freezing_with_cachingz$Skipping graph with frozen constantszORuntime constant folding can introduce constants that aren't static across runsr   )CompilerBisectorz$dont cache graph when bisect enabledNzfx graph cache no shape envzNo shape env)r   r  r  r   r-   r  r=  r/   rq   r  Zjustknobs_checkaot_inductorZuse_runtime_constant_foldingZ!torch._inductor.compiler_bisectorr  Zbisection_enabledrU  r  r  r  r   )r#  pr  rU   rU   rW   _check_can_cache  s(   

zFxGraphCache._check_can_cacher  r`   r  r  remote6tuple[Optional[tuple[str, list[str]]], dict[str, Any]]c           	   
   C  s   zt |  t| |||\}}W n8 tyH } z,td d  d7  < td| |r1tdt| dt|t	 d}d|fW  Y d}~S d}~ww ||fi fS )	a  
        Checks that the inductor input is cacheable, then computes
        and returns the cache key for the input.
        Returns (key_info, cache_info) where:
        - key_info is (hash_key, debug_lines), and
        - cache_info will contain debug info in the event of BypassFxGraphCache.

        NB: It is possible to have this function return a union instead. But
        I personally believe it is more annoying/difficult to read in that format.
        r  Zfxgraph_cache_bypassrD   z%Bypassing FX Graph Cache because '%s'Zbypass_fx_graphbypass)cache_stateZcache_bypass_reasoncache_event_timeN)
r  r  r  r=  r   rU  r  r=   rp   r   )	r#  r  r  r  r  r   rj  r   r  rU   rU   rW   prepare_key  s$   
zFxGraphCache.prepare_keyc                  C  s   d} t | t ddS )zK
        Attempts to load the remote cache, returns None on error.
        zfx-graph-v1ZFbRemoteFxGraphCacheZRemoteFxGraphCache)rF   r   r  )Zcache_idrU   rU   rW   get_remote_cache  s   zFxGraphCache.get_remote_cacherj  rY  is_backwardc                 C  s  t | ||||\}}i || |t d}|dur^td|  td d  d7  < d|d< |r;td	 td
|  |j	 }	durZ|	|d< td|	d  t
|	 }
dkrZ|
|d< ||fS |rktd td|  td|  td d  d7  < d|d< ||fS )z
        Lookup the graph with the given key, and return results and metadata.
        Doesn't do any logging on its own, because AOTAutograd handles a cache miss
        differently from FXGraphCache.
        )r   
componentsr	  Nzfx graph cache hit for key %sr  Zfxgraph_cache_hitrD   r   r  Z"inductor_fx_remote_cache_hit_countZ!inductor_fx_remote_cache_hit_keysr  Z distributed_ephemeral_timeout_usi  r   Zephemeral_timeout_increaseZ#inductor_fx_remote_cache_miss_countZ"inductor_fx_remote_cache_miss_keyszfx graph cache miss for key %sZfxgraph_cache_missZmiss)r  r  r   rU  r  r   r   r  Zadd_to_set_toplevelr  r  )r   rj  r  r  r  r  r  r  r  r  Zephemeral_increaserU   rU   rW   load_with_key  sV   
zFxGraphCache.load_with_keyc                   C  s*   z
t t  W dS  ty   Y dS w )z.
        Clear out the on-disk cache.
        N)r  rmtreer  r  FileNotFoundErrorrU   rU   rU   rW   clear?  s
   zFxGraphCache.clearNrQ   rp   )r   rp   rQ   rp   )r   r  rQ   r  )rQ   r  )r   rp   r  r  r  rY   r  r  r  re   rQ   r  )r   rp   r   r   rQ   rR   )r   rp   r  rf   r  r  r  rY   r  r  rQ   rR   )r#  r$  rQ   rR   )r#  r$  r  r  r  r`   r  r  r  rY   rQ   r  )rQ   r  )r   rp   rj  rY  r  r  r  rY   r  r  r  rY   r  re   rQ   r  r   )r   r   r   r  r   r  r  r  r  r  r  r  r   r  r
  r  r  r  rU   rU   rU   rW   r    s:    	 :&$=r  ry   c                 C  s4   |  drtj| S |  drtj| S | dfS )zDReturns the path where the AOT Inductor compiled kernels are stored..soz.pt2r   )endswithrx   ry   split)ry   rU   rU   rW   split_aot_inductor_output_pathJ  s
   

r  c                   @  sN   e Zd ZU i Zded< eejZedddZ	edddZ
edddZdS )CudaKernelParamCachezdict[str, dict[str, Any]]r   r   rp   paramsdict[str, str]r   bin_typerQ   rR   c                 C  s8   t |||ttjjd d\}}||t < || j|< d S )Nr   )r   r   )r   r  r   r  output_pathrt   r   )clsr   r  r   r  _ry   rU   rU   rW   r  Z  s   

zCudaKernelParamCache.setr   c                 C  s   | j |d S rT   )r   r   )r  r   rU   rU   rW   r   h     zCudaKernelParamCache.getKeysView[str]c                 C  s
   | j  S rT   )r   r   )r  rU   rU   rW   get_keysl  r   zCudaKernelParamCache.get_keysN)
r   rp   r  r  r   rp   r  rp   rQ   rR   )r   rp   rQ   r   )rQ   r  )r   r   r   r   r  r   r  cache_clearclassmethodr  r   r   rU   rU   rU   rW   r  U  s   
 
r  c                   @  s   e Zd ZedddZdS )AotCodeCompilerr;  rc   wrapper_coderp   kernel_codeserialized_extern_kernel_nodesOptional[str]device_typeadditional_filesrY  rQ   Union[list[str], str]c          G   	     s
	  |}t jdkrtdt  t }tddt|jdd}	t|		 }
t
 o.dko.jtt
jj\}}t
jjrDdfd	td
|
|d\}	td|
|d\}t
jjrj|	 t
jjsj| td	 td td	fddfddd tdfddfddd t	}t}|j|  sjdd ttd }dQfdd}d d!lm} t }|tj ||d" t!d#}| |rt|"d$}t#|d%}|| W d&   n1 sw   Y  t
jjr|| t
jj$}|d'< t|%|j& d(}t
jj$' D ]\}}t(|tr.t(|ts2J d)qt#|d%}|t)*t
jj$ W d&   n	1 sOw   Y  t|%|j& d(}t+,|| t
jjry|| t
jjsy|| |rt
jjnt|"d*}t-fd+d,j./ D dRd1d2 t
jj0rd3 fd4d,j./ D }nd3}t1|}t
  o|d5k} t
jj2rd} j| t
 |d6}!tdSdt
jj d7|!}"tdSd8di|!}#tt|j&	t|j|"d9}$|$	 }%|$3 }&tt|j&t|j|#d9}'|'	 }(|'3 })t45d:|% t45d;|( t
jjrLt|%|j& d<}*|"6|* ||* |$7| |$8|	 || n|$9  |'9  | s\|}+d },nt:t;t<=d t<>t<j?j@d=A },tBCd>|d? |,}+||+t j}-d }.i }/tDjE' D ];\}.\}0}1t(|1t<jFjGsJ tH |. }2t45d@|0|2 |2|/|0< t<jFI|1}3tj |j|2}4tJ|4|3d ||4 qtj |jdA}5t#|5d%}|t)*|/ W d&   n	1 sw   Y  ||5 t<jKjLrtM ntN }6dBdC |6jOP D }7dD|7}7tQ|\}8}9t|jdE}:t|8|7r"|&|)|-|7gn|&|)|-g|9|:d9};|;	 }<|;3 }t45dF|< t#	dG}|d |dH|% d |dI|< d W d&   n	1 saw   Y  t#dG}|d |dH|( d |dI|< d W d&   n	1 sw   Y  t
jjrt|%|j& dJ}=|:6|= ||= |tR | rt|%|j& dK}>t#|>dL}?|?| |?tBCdM|, W d&   n	1 sw   Y  ||> ||- ||7 |;8||- |7S D ]	}@|;8||@ q|;T| n_|;9  |&|)|-fD ]}AtU|A q| rbd d&lV}B|BW }Ct@dN|C}Dt#|dO%}E|EX }F|EdP|D|F|D    |E| |EtBCdM|, W d&   n	1 s]w   Y  t
jjrl|| W d&   n	1 sww   Y  t
jjr|S |S )Tz
        Returns the .so path, or returns a list of files that were generated if
        config.aot_inductor.package=True.
        rn   z.AotCodeCompiler not yet supported for inductoroi)vec_isar(  aot_moder   sourcesBuildOptioncpur  r   zwrapper.cpp)r   r   z
kernel.cppzWrapper code written to: %szKernel code written to: %sZ
graph_dumpc                        dd dS )NZinductor_aot_wrapper_codecppr   rZ  r  rU   rU   )wrapper_pathrU   rW   rr       z)AotCodeCompiler.compile.<locals>.<lambda>c                     r  rT   rU   rU   )r$  rU   rW   rr    r  r  c                     r3  )NZinductor_aot_kernel_coder4  r5  rU   rU   )kernel_pathrU   rW   rr    r7  c                     r  rT   rU   rU   )r%  rU   rW   rr    r  Tr   zCMakeLists.txtconstsr   platformrp   rQ   c                   s&  |dkr j tj @ rt| dkrtdd}nd}d}n|dkr)d}d	}ntd
| t| dk}d| d}|dt d7 }|d| d7 }|| d7 }|si| D ]
}|d| d7 }qW| sh|d7 }n|d7 }|dt| d  d7 }|d| d7 }|| d7 }t|dt	d\}}t
|}t dkr ndjdd}	tt	|jt	|t	|j|	d }
|
 }|
  |rt|d!=}|d" |d}|d#}|d$ksJ || d"}|t| k r|| |d  }||7 }|t| k sW d    n	1 sw   Y  t| |S )%Nlinux 5wzPModels with buffer mutation included doesn't support constants greater than 2GB!z.ldata, "aw"z.lrodata, "a"r   darwinz__DATA,__datar  zUnsupported platform: i   z
	.section	r  z		.balign z	.globl	z_binary_constants_bin_start
z_binary_constants_bin_start:
z	.byte z
	.space 1
z	.quad 0x1234567899abcdef
z	.space    z.globl	z_binary_constants_bin_end
z_binary_constants_bin_end:
S)r   Zxpur2  T)r(  r.  compile_onlyuse_relative_pathr   r0  
output_dirr1  zr+br   s   ͫxV4r   )Zmutated_buffersrC   r  r   ra  
ValueErrorr   r5   r   rp   r
   r(   r.  r&   stemr  get_target_file_pathbuildr   rQ  r{  findrx   r  )r9  r:  Zsection_attrZsymbol_prefixZis_large_constsZ
consts_asmr   r  Zconsts_sZobject_build_optionsZobject_builderconsts_or  hdrZ	start_idxposrc)r(  r;  specified_sub_dirrA  rU   rW   _compile_consts  s   





z0AotCodeCompiler.compile.<locals>._compile_constsr   FileLock.locktimeoutz.jsonr   NZAOTI_DEVICE_KEYz_metadata.jsonz"Metadata must only contain stringsr  c                 3  s&    | ]}| j vr |jV  qd S rT   )folded_constantsget_original_value_of_constantis_cudar   r   )r;  rU   rW   r   \  s    

z*AotCodeCompiler.compile.<locals>.<genexpr>r  torch.Tensorall_cudarY   c           	      S  s   ddd}dd l }|  dkrdS | jr#tjj| }tjj| }n|  	 }| }|
 }||||j| }t|j}|rF|S ||S )	N	raw_bytesr   rQ   c                 S  s$   |  t| t d t t d}|S )NrD       )ljustra  r5   )rZ  Zpadded_bytesrU   rU   rW   _pad_to_alignmentc  s
   zEAotCodeCompiler.compile.<locals>._to_bytes.<locals>._pad_to_alignmentr       )rZ  r   rQ   r   )ctypesZnumelr<  rq   opsZmkldnndata_ptrZ_nbytesZuntyped_storager2  nbytesr   ZPOINTERZc_ubyter   contents)	r  rY  r]  r_  ra  rb  Zt_cpuZ	raw_arrayrZ  rU   rU   rW   	_to_bytesb  s    
	
z*AotCodeCompiler.compile.<locals>._to_bytesr^  c                 3  s*    | ]}|j vr |V  qd S rT   )rT  rU  rW  )rd  rY  r;  rU   rW   r     s    
r<  )r.  r(  use_mmap_weightsrA  r-  )r@  Zmin_optimizer@  rB  z#aot wrapper compilation command: %sz"aot kernel compilation command: %sz_compile_flags.json)rD   Zqqr>  zsaving script object %s as %szcustom_objs_config.jsonc                 S  s   g | ]}|j d r|j qS )z.o)r  r  )r   entryrU   rU   rW   r    s    
z+AotCodeCompiler.compile.<locals>.<listcomp> )r-  r(  r.  rA  zaot linkage command: %saz// Compile cmd
// z// Link cmd
// z_linker_flags.jsonz_serialized_weights.binr  qi @  za+b    )r9  r   r:  rp   rQ   rp   )r  rX  rY  rY   rQ   r   rU   )Ysysr:  r   r#   r,   r&   r(   r.  r   get_command_liner   r  r  r  r  Zpackage_cpp_onlyrz   r   packagerb  r  r  r9   r
   r  r   r  rp   torch.utils._filelockrP  r   rx   ry   LOCK_TIMEOUTwith_suffixr   r9  	with_namerE  r   r   r   r   r  r   r   r  r   Zpackage_constants_in_sora  Zforce_mmap_weightsrF  rU  r  Zsave_flags_to_jsonZsave_compile_cmd_to_cmakeZsave_src_to_cmakerG  r   r  rq   randintZiinfoZint64maxitemstructpack	enumerateZtorchbind_constantsr  r  rE   Z_pickle_saver   rr   rs   ROCmCodeCacheCUDACodeCacher   r  r*   r"   r  Zsave_link_cmd_to_cmaker  resourceZgetpagesizetell)Gr  r;  r$  r%  r&  r(  r)  Zgenerated_filesZpicked_vec_isaZvec_isa_cmd_genZcpp_commandZspecified_output_pathZspecified_artifact_nameZwrapper_keyr  Zwrapper_path_operatorZkernel_path_operatorZ
cmake_pathrN  rP  r   lockZextern_kernel_nodes_jsonr  r9  Z	meta_jsonrh  ri  Zkernel_meta_jsonZ	output_soZserialized_weightsZconsts_sizere  compile_commandZwrapper_build_optionsZkernel_build_optionsZwrapper_builderZwrapper_compile_cmdZ	wrapper_oZkernel_builderZkernel_compile_cmdZkernel_oZcompile_flagsZaot_constantsZmagic_numberrI  Zcustom_obj_idxZqual_name_to_idr   ZconstantZcustom_obj_nameZcustom_obj_bytesZcustom_obj_pathZconstants_config_jsonZgpu_codecacheZgpu_kernels_ooutput_namerC  Zso_build_optionsZ
so_builderZlink_cmdZlinker_flagsZweight_fileZ	f_weightsZgpu_oZo_filerz  Z
page_size_Z	page_sizeZf_soZso_sizerU   )
rd  rY  r(  r;  r%  r8  rM  rA  r$  r6  rW   compiler  s*  







	


N


































    
#zAotCodeCompiler.compileN)r;  rc   r$  rp   r%  rp   r&  r'  r(  rp   r)  rY  rQ   r*  )r   r   r   r"  r  rU   rU   rU   rW   r#  q  s    r#  c                  C  sZ   t tjd } |  }| }t|d\}}W d    t|S 1 s$w   Y  t|S )Nr  rg  )r
   r  r  r   r{  r   r+   )ry   r  r   r  r  rU   rU   rW   cpp_prefix_patha  s   


r  c                  C  s.   t  } t rdtj|  dS d|  dS )Nz
#include "")r  r   r  rx   ry   r   r  rU   rU   rW   
cpp_prefixn  s   r  zOptional[CDLL]_libgompr   Union[list[c_void_p], c_void_p]c                   sL  d fdd  fdd|D }|  dsJ | d	 d }t| d
D ]\}}|dkr1t|}t||}q$t|sAJ | d t }t|j	j
|D ]\}}|jrW|||j< qK|rb|t| d = ||i |}	t|	ttfrdd |	D }	t|	D ]\}}
t|
tjsJ | d q{tjj|	S t|	tjsJ | d tjj|	S )Nargr   rQ   c                   sJ   t t| dkrtjj| S t| ttfr#t|  fdd| D S | S )Nz<class 'PyCapsule'>c                 3  s    | ]} |V  qd S rT   rU   r   rh  convert_argrU   rW   r     r   z9custom_op_wrapper.<locals>.convert_arg.<locals>.<genexpr>)	rp   rZ  rq   r  _aotiZ&alloc_tensor_by_stealing_from_void_ptrr   r_  tuple)r  r  rU   rW   r  ~  s
   z&custom_op_wrapper.<locals>.convert_argc                   s   g | ]} |qS rU   rU   r   r  r  rU   rW   r    r  z%custom_op_wrapper.<locals>.<listcomp>z
torch.ops.z, can not be called through custom_op_wrapperr   r   z, can not be loaded through custom_op_wrapperc                 S  s"   g | ]}|d u rt g n|qS rT   )rq   Ztensor)r   rrU   rU   rW   r    s   " z returns a list of non-tensorsz returns a non-tensor)r  r   rQ   r   )
startswithrw  r  	importlibimport_moduler  callablerc  zipZ_schema	argumentsZ
kwarg_onlyr   ra  r   r_  r  rq   r   r  r  Z#unsafe_alloc_void_ptrs_from_tensorsZ!unsafe_alloc_void_ptr_from_tensor)r   rO   Zconverted_argsfuncr,  rC  rP   Zfunc_argZconv_argresultr  rU   r  rW   custom_op_wrapper{  s4   	

r  c                   @  sr   e Zd ZU i Zded< eejZi Zded< edd
dZ	e
dddZe
			ddddZe
ddddZdS ) CppCodeCache0dict[str, Callable[[], Union[CDLL, ModuleType]]]r   r~   cpp_compile_command_flagsry   rp   r   rQ   Union[CDLL, ModuleType]c                 C  s
   t | S rT   )r   LoadLibrary)ry   r   rU   rU   rW   _load_library_inner  r   z CppCodeCache._load_library_innerc              
   C  s   z|  ||}||_|W S  ttfyU } z;dt|v r8tjdr8t	da
|  ||}||_|W  Y d }~S dt|v rPt| dt  dt  d| d }~ww )NZgompz/usr/lib64/libgomp.so.1z(failed to map segment from shared objectz3.  The most common reason this may occur is if the zl folder is mounted with noexec (e.g., by default Docker mounts tmp file systems as noexec).  Please remount zi with exec enabled, or set another temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable.)r  r   ImportErrorr  rp   rx   ry   r   r   r  r  tempfile
gettempdir)r  ry   r   r  r   rU   rU   rW   _load_library  s*   
zCppCodeCache._load_libraryr2  NrU   r  r(  	submit_fnr   extra_flagsSequence[str]c                   sR  i j |t |d}t  tddtdi |d}t| }t|d|d\}jvrddl	m
}	 tjt d	 }
t|\}}d d tdi |d
t oV|dki}t||||d}tt|
|t|  d fdd}|d ur|	|
td tj s|W d    n1 sw   Y  |j< j S )N)r(  r-  r  r+  r,  r/  r4  r   r   rO  rQ  rA  r2  rB  rQ   r   c                    sF   d u r!d ur    } | d u sJ  d us!J S rT   )r  r  )r  Zbinary_pathr  futurer   r  Z	worker_fnrU   rW   load_fn  s   z(CppCodeCache.load_async.<locals>.load_fnrR  rU   rQ   r   )r  r,   r#   r&   r(   r   rl  r   r   rn  rP  rx   ry   rz   r   r*   r   r  r   r	   _worker_compile_cppr+   rF  ro  r   )r  r  r(  r  r  r}  command_genZvec_isa_cmd
input_pathrP  	lock_pathr~  rC  Zcpp_build_optioncpp_builderr  rU   r  rW   
load_async  sZ   


zCppCodeCache.load_asyncc                 C  s   |  || S rT   )r  )r  r  r(  rU   rU   rW   r     r  zCppCodeCache.load)ry   rp   r   rp   rQ   r  )r2  NrU   )
r  rp   r(  rp   r  r   r  r  rQ   r   )r2  )r  rp   r(  rp   rQ   r   )r   r   r   r   r  r   r  r!  r  r  r"  r  r  r   rU   rU   rU   rW   r    s   
 
Hr  r  r  r&   c                 C  sf   ddl m} || td tj| s!|  W d    d S W d    d S 1 s,w   Y  d S )Nr   rO  rR  )rn  rP  ro  rx   ry   r   rF  rG  )r  r  rP  rU   rU   rW   r    s   
"r  c                   @  sx   e Zd ZU i Zded< eejZdddZdZ	dZ
dZed	Zed%ddZe				d&d'dd Zed(d#d$ZdS ))CppPythonBindingsCodeCacher  r   FTZinclude_pytorchZsharedr  zkernel({}); Py_RETURN_NONE;r   aR  
        // Python bindings to call {entry_func}():
        #define PY_SSIZE_T_CLEAN
        #include <Python.h>
        #include <sstream>
        #include <cstdlib>

        #ifndef _MSC_VER
        #if __cplusplus < 202002L
        // C++20 (earlier) code
        // https://en.cppreference.com/w/cpp/language/attributes/likely
        #define likely(x)       __builtin_expect(!!(x), 1)
        #define unlikely(x)     __builtin_expect(!!(x), 0)
        #endif
        #else
        #define likely(x) (x)
        #define unlikely(x) (x)
        #endif

        // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow.
        // We manually link it below to workaround issues with fbcode build.
        static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj);

        template <typename T> static inline T parse_arg(PyObject* args, size_t n) {{
            static_assert(std::is_pointer_v<T>, "arg type must be pointer or long");
            return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n)));
        }}
        template <> inline int64_t parse_arg<int64_t>(PyObject* args, size_t n) {{
            auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == -1 && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return result;
        }}
        template <> inline uintptr_t parse_arg<uintptr_t>(PyObject* args, size_t n) {{
            auto result = PyLong_AsVoidPtr(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == reinterpret_cast<void*>(-1) && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return reinterpret_cast<uintptr_t>(result);
        }}

        {extra_parse_arg}

        static PyObject* {entry_func}_py(PyObject* self, PyObject* args) {{
            try {{
                if(unlikely(!PyTuple_CheckExact(args)))
                    throw std::runtime_error("tuple args required");
                if(unlikely(PyTuple_GET_SIZE(args) != {arg_len}))
                    throw std::runtime_error("requires {arg_len} args");
                {call_entry_func}
            }} catch(std::exception const& e) {{
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            }} catch(...) {{
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }}
        }}

        static PyMethodDef py_methods[] = {{
            {{"{entry_func}", {entry_func}_py, METH_VARARGS, ""}},
            {{NULL, NULL, 0, NULL}}}};

        static struct PyModuleDef py_module =
            {{PyModuleDef_HEAD_INIT, "{entry_func}", NULL, -1, py_methods}};

        PyMODINIT_FUNC PyInit_{entry_func}(void) {{
            const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
            if(!str_addr) {{
                PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
                return nullptr;
            }}
            std::istringstream iss(str_addr);
            uintptr_t addr = 0;
            iss >> addr;
            _torchinductor_pyobject_tensor_data_ptr =
                reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
            PyObject* module = PyModule_Create(&py_module);
            if (module == NULL) {{
                return NULL;
            }}
            #ifdef Py_GIL_DISABLED
                PyUnstable_Module_SetGIL(mod, Py_MOD_GIL_NOT_USED);
            #endif
            return module;
        }}
        ry   rp   r   rQ   r   c                 C  s   t tjjjjtjd< | d| j }zt	j
| W S  ty"   Y nw tj||}|d us0J tj|}|t	j
|< |j| |S )NZ'_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTRr   )rp   rq   r  Z_dynamor  Z'_torchinductor_pyobject_tensor_data_ptrrx   environentry_functionrk  r  KeyErrorr  utilspec_from_file_locationmodule_from_specloaderexec_module)r  ry   r   module_namer  r  rU   rU   rW   r    s   


z.CppPythonBindingsCodeCache._load_library_innerr2  r   NrU   argtypesrY  r  r(  num_outputsr  r  r   r  r  c           
        st   d dd t|D } jjt| j| j jj|dd} j|| |||ddd fd
d}	|	S )a5  
        Wrap a C++ function in fast Python bindings.

        Args:
            argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
            source_code: C++ source code containing a ENTRY_FUNCTION() function

        Returns:
            A python version of ENTRY_FUNCTION()
        , c                 s  s.    | ]\}}d | dd d| dV  qdS )z
parse_arg<zconst r   z>(args, )N)r  )r   nZargtyperU   rU   rW   r     s
    
zBCppPythonBindingsCodeCache.load_pybinding_async.<locals>.<genexpr>)Z	array_len)Zarg_lenZcall_entry_funcZ
entry_funcextra_parse_arg)r  r  NrQ   r   c                     s(   d u r t tsJ t jS rT   )r   r   r  r  rU   r  Z
get_resultr  rU   rW   r    s   z?CppPythonBindingsCodeCache.load_pybinding_async.<locals>.futurer  )	rz   rw  suffix_templateformatra  call_entry_functionr  r  r  )
r  r  r  r(  r  r  r  Z	parseargssuffixr  rU   r  rW   load_pybinding_async  s$   

z/CppPythonBindingsCodeCache.load_pybinding_asyncrO   rP   c                 O     | j |i | S rT   )r  r  rO   rP   rU   rU   rW   load_pybinding     z)CppPythonBindingsCodeCache.load_pybinding)ry   rp   r   rp   rQ   r   )r2  r   NrU   )r  rY  r  rp   r(  rp   r  r  r  r   r  r  rQ   r   )rO   r   rP   r   rQ   r   )r   r   r   r   r  r   r  r!  r  r  r  r  textwrapdedentr  r"  r  r  r  rU   rU   rU   rW   r  #  s,   
 
Y.r  c                   @  s@   e Zd ZU i Zded< eejZdddZdZ	dZ
edZdS )	CppWrapperCodeCacher  r   Tr  Zinductor_entry_cppzreturn inductor_entry_cpp({});a	  
        #include <torch/csrc/inductor/aoti_torch/c/shim.h>

        static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {{
            std::vector<AtenTensorHandle> result;
            size_t result_len = PyList_GET_SIZE(pyvec);
            result.reserve(result_len);
            for (size_t i = 0; i < result_len; i++) {{
                // AtenTensorHandle is essentially a pointer
                void* elem = PyCapsule_GetPointer(PyList_GET_ITEM(pyvec, i), NULL);
                result.push_back(reinterpret_cast<AtenTensorHandle>(elem));
            }}
            return result;
        }}

        static inline PyObject* pack_tensor_handle_list(const std::array<AtenTensorHandle, {array_len}>& arr) {{
            PyObject* result = PyList_New({array_len});
            for (size_t i = 0; i < {array_len}; i++) {{
                PyObject *elem =
                    arr[i] == nullptr
                        ? Py_None
                        // Store AtenTensorHandle as PyCapsulate
                        : PyCapsule_New(reinterpret_cast<void*>(arr[i]), NULL, NULL);
                PyList_SET_ITEM(result, i, elem);
            }}
            return result;
        }}

        template <> inline std::vector<AtenTensorHandle> parse_arg<std::vector<AtenTensorHandle>>(PyObject* args, size_t n) {{
            return unpack_tensor_handle_list(PyTuple_GET_ITEM(args, n));
        }}

        PyObject* inductor_entry_cpp(std::vector<AtenTensorHandle>&& input_handles) {{
            // For outputs, we only allocate an array to hold returned tensor handles,
            // not the actual output tensor storage.
            std::array<AtenTensorHandle, {array_len}> output_handles{{}};
            try {{
                inductor_entry_impl(input_handles.data(), output_handles.data());
                if (PyErr_Occurred()) {{
                    return nullptr;
                }}
                return pack_tensor_handle_list(output_handles);
            }} catch(std::exception const& e) {{
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            }} catch(...) {{
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }}
        }}
        N)r   r   r   r   r  r   r  r!  r  r  r  r  r  r  rU   rU   rU   rW   r    s   
 
r  c                   @  s   e Zd ZU i Zded< eejZdZded< e	
dZee	
d Zee	
d Ze	
d	Zed0ddZed1ddZeedd2ddZed3ddZeedd4d d!Zeedd4d"d#Ze	d5d6d(d)Zed7d,d-Zed2d.d/ZdS )8HalideCodeCachez0dict[str, Callable[[], Union[ModuleType, CDLL]]]r   Nr'  _standalone_runtime_patha  
        #include "{halideruntime_h}"
        #include "{headerfile}"
        #include <stdexcept>
        #include <cmath>

        namespace c10 {{
            inline long div_floor_integer(long a, long b) {{
                if ((a<0) != (b<0)) {{
                    const auto quot = a / b;
                    const auto rem = a % b;
                    return rem ? quot - 1 : quot;
                }}
                return a / b;
            }}
        }}
        z
        void kernel({argdefs}) {{
            {buffers}
            int err = halide_kernel({buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a{  
        #include <cuda.h>
        static const halide_device_interface_t* cuda_interface = halide_cuda_device_interface();

        void kernel({argdefs}, uintptr_t stream) {{
            {buffers}
            int err = halide_kernel(reinterpret_cast<void*>(stream), {buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a  
        #include "{}"
        #include <cuda.h>

        static int acquire_context(void* user_context,
                                   void** cuda_context_out,
                                   bool create) {{
            return cuCtxGetCurrent(reinterpret_cast<CUcontext*>(cuda_context_out));
        }}

        static int release_context(void* user_context) {{
            return 0;
        }}

        static int get_stream(void* user_context,
                              void* cuda_context,
                              void** stream_out) {{
            *stream_out = user_context;
            return 0;
        }}

        static int register_halide_hooks() {{
            halide_set_cuda_acquire_context(&acquire_context);
            halide_set_cuda_release_context(&release_context);
            halide_set_cuda_get_stream(&get_stream);
            return 0;
        }}

        int inductor_register_halide_hooks_result = register_halide_hooks();
        r   rp   r  ri   r   rY   rQ   rY  c                 C  sV  |j d usJ |jd urt|j t|jksJ |jd usJ |jp$|j d|j }|r:d| d}d}d}d}nd}d}d| d}d	}g }	t|j |jD ]\}
}|	d
|
 d| d qOd| dd| dd|	 d| d| d| d| d| d| d| d| d| d|	  d| dt|	 d| d| d| dg
S )Nz + zreinterpret_cast<uint64_t>(r  Zcuda_interfaceZnullptrZhalide_buffer_flag_device_dirty0zreinterpret_cast<uint8_t*>(Zhalide_buffer_flag_host_dirtyzhalide_dimension_t(0, r  zhalide_buffer_t ;zhalide_dimension_t z_dims[] = {z};z
.device = z.device_interface = z.host = z	.flags = z.type = z.dimensions = z.dim = z_dims;z.padding = nullptr;)
shapestridera  offsetalias_ofr   r  rb  rz   Zhalide_type)r  r   r  r   ra  r   Zdevice_interfacehostflagsdimssizer  rU   rU   rW   _codegen_buffer\	  s6   "
zHalideCodeCache._codegen_bufferr   rj   
headerfileobjectc           
      C  s   |  }|d|jv u sJ d|jv sJ g }g }t|jD ]+\}}| r;|d|  || d| || qd|jvsBJ ||j	 qd
dd |D  }|rZ| jn| j}|j| |red	nd
|d
dd |jD |d
|d}	|	S )NZuser_contextZ
no_runtimez&hl_buf_Zhl_buf_*r  c                 S  s   g | ]}d | qS )    rU   )r   linerU   rU   rW   r  	  s    z1HalideCodeCache._codegen_glue.<locals>.<listcomp>HalideRuntimeCuda.hzHalideRuntime.hr  c                 s  s.    | ]}|j d u r|  d|j V  qd S )Nrg  )r  bindings_typer   r  rU   rU   rW   r   	  s    
z0HalideCodeCache._codegen_glue.<locals>.<genexpr>)Zhalideruntime_hr  Zargdefsbuffersbuffer_names)rV  r  rw  r  Z	is_bufferrb  extendr  ctyper   rz   lstripglue_template_cudaglue_template_cppr  find_header)
r  r   r  rV  r  r  r,  r  Zglue_templateZ	glue_coderU   rU   rW   _codegen_glue~	  s2   

zHalideCodeCache._codegen_gluec                 C  s:   t ddt d}| }td| j| j| j|gdS )NOIr/  r  r   )	r&   r'   rl  r   rz   r  r  standalone_runtime_cuda_initr   )r  r  Zcommand_linerU   rU   rW   config_hash	  s    zHalideCodeCache.config_hashr  errmsgc           	   
   C  s   t jjd}|d u s|jstdzX|jd }t|D ]H}|drezt	
dtj||g}W n
 t	jy<   Y qw td|d}|retjtj|d| }tj|retj|  W S qW t| ty{ } zt||d }~ww )	Nhalidez$halide python bindings not installedr   r  Zlddz(/.*)/libHalide.sor   rD   )r  	machinery
PathFinderry  r~  r   rx   r  r  
subprocesscheck_outputry   rz   SubprocessErrorrI  searchr   abspathgroupr   r  )	r  r  r  r  fileoutmry   r   rU   rU   rW   _search_for_file	  s4   


z HalideCodeCache._search_for_filec                 C  sV   d|    d}dtjv rtjtjd |}tj|r|S d| d}t||S )NZlibautoschedule_r  
HALIDE_LIBCan't find z3, set env HALIDE_LIB to the directory containing it)r   rx   r  ry   rz   r   r  r  )r   sofilery   r  rU   rU   rW   find_libautoschedule	  s   

z$HalideCodeCache.find_libautoschedulec                 C  s   dt jv rt jt jd | }t j|r|S dt jv r5t jt jt jd d|  }t j|r5|S d|  d}td|  |S )NZHALIDE_INCLUDEr  z../include/r  z7, set env HALIDE_INCLUDE to the directory containing it)rx   r  ry   rz   r   r  r  r  )r   ry   r  rU   rU   rW   r  	  s   


zHalideCodeCache.find_headerr  r  r   Callable[[], Any]c              
     s  t tt|t|  |fddd }tj|dd d t|d }t|d }t|d }t|d	 }t|d
 }	tj	| }
g }|
rwt
|| tj|ddd| ddddg
}|jrf|d| |jg ||  |ttj| dd |jD }| r|d | j|| ||||  f|
r|jnd | rdndd |
r|tt| tt|	|}|r||jn|  d fdd}|S )Nr  r     Tr   zgenerate_kernel.pyzhalide_kernel.azhalide_kernel.hdoner|  -gr  -oz-fZhalide_kernelz-ezstatic_library,h,schedulez-pc                 S  s   g | ]}|j d u r| qS rT   )r  r  r  rU   rU   rW   r  
  s    z9HalideCodeCache.generate_halide_async.<locals>.<listcomp>Z	uintptr_tr   r2  )r  r  r(  rQ   r   c                     s   r    S rT   rU   rU   Zbindings_futureZwait_for_compilerU   rW   r   '
  s   z3HalideCodeCache.generate_halide_async.<locals>.load)rQ   r   )r
   r   r   r   r  rx   r   rp   ry   r   r   rk  
executableZ	schedulerr  r  rO   rb  r   r	   r  
check_callr  rV  r  r  build_standalone_runtimetouch_worker_task_halider  )r  r   r  r  dirpathZgenfileZlibfiler  donefilelockfileZneed_compilejobscmdZbinding_typestaskr   rU   r  rW   generate_halide_async	  sr   	



z%HalideCodeCache.generate_halide_asyncrO   rP   c                 O  r  rT   )r  r  rU   rU   rW   generate_halide.
  r  zHalideCodeCache.generate_halidec              	   C  s  | j rtj| j r| j S tj rdnd}d}|dkrdnd}| j r0tj| j r,J t }nt }t	|d| d| 
   }tj|dd	 t|d
 }t|d }t|d }t|d }	t|| }
tj|sdd l}ddlm} ||t] tj|st|d}|dkr|| j| d W d    n1 sw   Y  ||	|| t|
\}}t|||	g|t|dd}tt|  t | W d    n1 sw   Y  tj|
sJ |
| _ |
S )Nr   r2  zlibStandaloneHalideRuntime.soz	host-cudar  zhalide-runtime--Tr   r  r|  z	hooks.cppzstandalone_halide_runtime.ar   rO  r   r  r(  rB  )!r  rx   ry   r   rq   r   r  r4   r3   r
   r  r   rp   r  rn  rP  ro  r   r   r  r  r  Zcompile_standalone_runtimeZTargetr*   r&   r(   r  r  shlexr  rl  r	  )r  r(  Zlibnamer  baser  r  r  ZhookfileZafiler  ZhlrP  r  r   rC  Zhalide_cmd_genrU   rU   rW   r  2
  sf   	z(HalideCodeCache.build_standalone_runtime)r   rp   r  ri   r   rY   rQ   rY  )r   rj   r  r  rQ   rp   r  )r  rp   r  rp   rQ   rp   )r   rp   rQ   rp   rT   )r   rj   r  rp   r  r   rQ   r   )rO   r   rP   r   rQ   r   )r   r   r   r   r  r   r  r!  r  r  r  ro  r  r  r  r"  r  r  r   r   r  r  r  r  r  r  r  rU   rU   rU   rW   r  	  sL   
 
	!!Dr  r  r  list[partial[Any]]c                 C  sZ  ddl m} z"|| t |D ]}|  qW d    W d S 1 s!w   Y  W d S  tjy } zwtjddkrt|dd^}}}tj	
|drt| }d}	||	d	ks\J G d
d d}
|
 ||dd	 < ttddg|dd}||	|}tdd}||  W d    n1 sw   Y  td| | d }~ww )Nr   rO  ZHALIDE_REPRO1r  )r   r   r   pythonz    hl.main()rD   c                   @     e Zd ZdddZdS )z _worker_task_halide.<locals>.OutrQ   rp   c                 S  rZ   )Nr  rU   r   rU   rU   rW   __repr__~
  rX   z)_worker_task_halide.<locals>.Out.__repr__Nr  )r   r   r   r  rU   rU   rU   rW   Out}
      r  r  z                        import sys, tempfile
                        with tempfile.TemporaryDirectory() as out:
                            sys.argv = zrepro.pyz?
                            hl.main()
                        r  r   zwrote repro.py: )rn  rP  ro  r  r  rx   r  r   r  ry   r   r  r   r{  countindexr  r   r  r  r   r  r   )r  r  rP  Zjobr   r  scriptr  r   mainr  replfdrU   rU   rW   r
  n
  s@   &r
  r  c                 C  s   t | d  d S )Nrh  )r   closer  rU   rU   rW   r	  
  s   r	  c                   @  s   e Zd ZU g Zded< i Zded< ed$d%ddZe			d&d'ddZe		d(d)ddZ	ed*d+ddZ
eedd,d"d#ZdS )-PyCodeCachezlist[ModuleType]r  z dict[str, list[tuple[Any, ...]]]linemapsr   r  rp   r   rQ   r   c                 C  s   t |d|dS Npyr  r   )r  r  r   rU   rU   rW   r   
  r  zPyCodeCache.writeNlinemapOptional[list[tuple[int, str]]]attrsr   r   c                 C  s"   t |d|d\}}| ||||S r'  )r   load_by_key_path)r  r  r   r)  r+  r   ry   rU   rU   rW   r   
  s   zPyCodeCache.loadr   ry   c                 C  sv   |d u rg }t ||}tt| | j|< |d ur'| D ]
\}}t||| q|s3|s3tt|||_	| j
| |S rT   )r1   r_  r  r&  r   setattrr   r	   r2   Z_reload_in_subprocr  rb  )r  r   ry   r)  r+  modrh  ri  rU   rU   rW   r,  
  s   
zPyCodeCache.load_by_key_pathFpurgerY   rR   c              	   C  sJ   |r| j D ]}z|jsJ t|j W q ty   Y qw | j   dS )z
        Clear the in-memory module cache. If purge=True, also delete all the
        corresponding on-disk source files.
        N)r  r  rx   r  r  r  )r  r/  r.  rU   rU   rW   r!  
  s   

zPyCodeCache.cache_clearlinenor  Optional[list[dict[str, Any]]]c                 C  sX   || j vrd S | j | \}}t||}|dkrd S ||d  }|s#d S d	dd}||S )
Nr   rD   stack_tracerp   rQ   list[dict[str, Any]]c                 S  s"   d}t || }dd t|D S )Nz"File "(.+)", line (\d+), in (.+)\nc                 S  s"   g | ]\}}}|t ||d qS ))r  r  r   )r  )r   r  lr  rU   rU   rW   r  
  s    zPPyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace.<locals>.<listcomp>)rI  findallreversed)r2  regexmatchesrU   rU   rW   parse_stack_trace
  s
   z<PyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace)r2  rp   rQ   r3  )r&  r   )r  ry   r0  rd  r  r  rf  r9  rU   rU   rW   stack_frames_for_code
  s   



z!PyCodeCache.stack_frames_for_coder   )r  rp   r   rp   rQ   r   )r   NN)
r  rp   r   rp   r)  r*  r+  r   rQ   r   )NN)
r   rp   ry   rp   r)  r*  r+  r   rQ   r   rk  )r/  rY   rQ   rR   )ry   rp   r0  r  rQ   r1  )r   r   r   r  r  r&  r"  r   r   r,  r!  r   r   r:  rU   rU   rU   rW   r%  
  s&   
 
r%  kernel_namer  rk   c                 C  s   t t|| S rT   )r  r%  r   )r<  r  rU   rU   rW   _load_triton_kernel_from_source
  s   r=  r'  c                   C  s~   t tjjrtjjS t rtjt	j
ddS t tdr&tddS t tdr=tjtjtdddS dS )NbinZnvccZCUDACXXr   Z	CUDA_HOMEzbin/nvcc)r   Z
nvcc_existr   r   Zcuda_cxxr  rx   ry   rz   rJ   Zsdk_homegetenvrealpathrU   rU   rU   rW   _cuda_compiler
  s   rA  rY  c               	   C  s|   t  rddlm}  | d}nt jj}tj	tj
|dtj	tj
|dtj	tj
|dtj	tj
|dgS )Nr   r  zcutlass-3-headersincludeztools/library/includeztools/library/srcztools/util/include)r   r  r  r  Zget_dir_pathr   Zcutlass_dirrx   ry   r@  rz   )r  Zcutlass_pathrU   rU   rW   _cutlass_include_paths
  s   rC  c                  C  sv   t   ddlm}  | jdd}g }t r7t| |D ]}|d| dd| g q|d |d	 |S td
)Nr   )cpp_extensionr   r  z-Lz-Xlinkerz-rpath=z-lcudaz-lcudartzMUnsupported env, failed to find cuda libs! Currently only Linux is supported.)	r#   Ztorch.utilsrD  Zlibrary_pathsr7   r%   r  rb  NotImplementedError)rD  ZlpathsZextra_ldflagsry   rU   rU   rW   _cuda_lib_options  s   

rF  c                   C  s   g dS )N)z-fPICz-fno-strict-aliasingz-fvisibility=hiddenz-WconversionrU   rU   rU   rU   rW   _nvcc_host_compiler_options/     rG  c                  C  s   t  } | dkr
d} d|  d|  g}tjjr |d|  g7 }dddd	d
d|  dd| dtjjdddg
}t rH|dt	j
tjg tjjrS|g d tjjr^|g d tjjri|ddg |S )NZ90Z90aZsm_Zcompute_Zlto_z-t=0z"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1z+-DCUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES=1z'-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLEDz-wz-gencode=arch=compute_z,code=[,]z
-std=c++17z--expt-relaxed-constexprz-DNDEBUGz-ccbin)z	-lineinfor  z-DCUTLASS_DEBUG_TRACE_LEVEL=1)z--keepz,--ptxas-options=--warn-on-local-memory-usagez --ptxas-options=--warn-on-spillsz--resource-usagez--source-in-ptxz--use_fast_mathz -DCUTLASS_USE_TANH_FOR_SIGMOID=1)r   Zget_cuda_archr   r   Zenable_cuda_ltorz   Zcompile_opt_levelr  r  rx   ry   r  rJ   ZgccZenable_debug_infoZenable_ptxas_infoZuse_fast_math)archr   optionsrU   rU   rW   _nvcc_compiler_options8  s@   	rM  	src_filesdst_filedst_file_ext
extra_argsOptional[list[str]]c                 C  s  |d u rg }t  }t }t }t }|| dd |D  dd |D  | }d| }	d}
|dkrCt  dd| d| d|	 }
n9|dkr^|d	 t  dd| d
| d|	 }
n|dkrtt  dd| d
| d|	 }
ntd| dt	d|
 |
S )Nc                 S  s(   g | ]}d |v rd| nd| qS )=z-Xcompiler z-Xcompiler=rU   )r   optrU   rU   rW   r  s  s    z(cuda_compile_command.<locals>.<listcomp>c                 S  s   g | ]}d | qS )z-IrU   )r   ry   rU   rU   rW   r  w  r  rg  r   r+  z -c -o soz-sharedz -o exezUnsupported output file suffix !zCUDA command: %s)
rC  rF  rG  rM  rz   rA  rb  rE  rU  r  )rN  rO  rP  rQ  Zinclude_pathsZcuda_lib_optionsZnvcc_host_compiler_optionsZnvcc_compiler_optionsrL  Zsrc_fileresrU   rU   rW   cuda_compile_commandd  s<   

$
$$rY  c                   @  sV   e Zd ZdZdddZddd	Zdd
dZdddZdddZdddZ	dddZ
dS )
DLLWrapperz A wrapper for a dynamic library.lib_pathrp   rQ   rR   c                 C  s"   || _ d| _t|| _d| _d S )NFT)r[  is_openr   r  DLL)r   r[  rU   rU   rW   r     s   
zDLLWrapper.__init__c                 C  s   | j r|   d| _ d S d S r[   )r\  _dlcloser   rU   rU   rW   r$    s   
zDLLWrapper.closec                 C  s   d }t  rtd }t|dstd}t|dr|j}nt r-dd l}|jddd}|j}ntd|d ur`t  rDtg|_	|| j
j d S t r^dd l}ddlm} |jg|_	|| j
j d S d S td	 d S )
Ndlclosezlibc.sor   kernel32T)Zuse_last_errorz&Unsupported env, failed to do dlclose!)wintypeszKdll unloading function was not found, library may not be unloaded properly!)r7   r   r  r_  r8   r_  ZFreeLibraryrE  r   r  r]  Z_handlera  ZHMODULErU  rV  )r   Z	f_dlcloseZsymsr_  r`  ra  rU   rU   rW   r^    s4   


zDLLWrapper._dlcloser   Callable[..., None]c                   s4   | j std| j t| j| d fdd}|S )	NzCannot use closed DLL library: rO   r   rQ   rR   c                    s     |  }|rt d j d S )NzError in function: )r   r   )rO   errmethodrU   rW   _wrapped_func  s   z-DLLWrapper.__getattr__.<locals>._wrapped_funcrO   r   rQ   rR   )r\  r   r[  r  r]  )r   r   rf  rU   rd  rW   __getattr__  s
   zDLLWrapper.__getattr__r   c                 C  r  rT   rU   r   rU   rU   rW   	__enter__  rX   zDLLWrapper.__enter__rO   r   c                 G     |    d S rT   r$  )r   rO   rU   rU   rW   __exit__  r  zDLLWrapper.__exit__c                 C  rj  rT   rk  r   rU   rU   rW   __del__  r  zDLLWrapper.__del__N)r[  rp   rQ   rR   r   )r   rp   rQ   rb  )rQ   r   rg  )r   r   r   r  r   r$  r^  rh  ri  rl  rm  rU   rU   rU   rW   rZ    s    

	

#

rZ  c                   @  sj   e Zd ZU ejG dd dZi Zded< eej	Z
dZedddZe	ddddZedddZdS )ry  c                   @     e Zd ZU ded< ded< dS )zCUDACodeCache.CacheEntryrp   r  r  Nr   r   r   r  rU   rU   rU   rW   
CacheEntry     
 rp  dict[str, CacheEntry]r   cur  rp   rP  rQ   r   c                 C  .   t tdgd|}t|| j|d\}}||fS z
        Writes source code into a file with dst_file_ext as the file extension.
        Returns the hash key of source code, and the path to the file.
        Zdummy_inputZdummy_outputr  )r   rY  r   _SOURCE_CODE_SUFFIXr  r  rP  Zcuda_commandr   r  rU   rU   rW   r        
zCUDACodeCache.writeNrQ  rR  r   c                 C  s  |  ||\}}|| jvrddlm} t }|tj||d td}| |dt	| j
  | }	tj|	st|g|	||}
t|d}| d | d|
 d W d   n1 s^w   Y  t }td	|
 |
d
}ztj|tjtjd W n tjy } zt||j|d}~ww t }d||  d|
 }t| ntd| t||	| j|< W d   n1 sw   Y  | j| j||fS )z
        Compiles CUDA source_code into a file with dst_file_ext extension.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        r   rO  rQ  rR  Nrh  r  z// CUDA Compile cmd
// zCUDA Compilation: %srg  )stderrenvzCUDA Compilation took  seconds. Compile command: z8CUDA Compilation skipped: %s since output already exists)r   r   rn  rP  r   rx   ry   rz   ro  ra  rv  r   rY  r   r   rU  r  r  r  r  STDOUTr  CalledProcessErrorr   CUDACompileErroroutputr  ry  rp  r  )r  r  rP  rQ  r   r  rP  r   r|  r  r  r  
start_time	cmd_partserrorend_timelog_duration_msgrU   rU   rW   r    sH   





zCUDACodeCache.compiletuple[DLLWrapper, str, str]c                 C  <   |dkrt d| d| | ||\}}}t|||fS z
        Compiles source code and loads the generated .so file.
        Returns a tuple of DLLWrapper, hash_key, source_code_path
        rU  zCOnly support loading a .so file for now. Requested file extension: z. Source code: r   r  rZ  r  r  rP  Zdst_file_pathr   Zsource_code_pathrU   rU   rW   r        
zCUDACodeCache.loadr  rp   rP  rp   rQ   r   rT   r  rp   rP  rp   rQ  rR  rQ   r   r  rp   rP  rp   rQ   r  )r   r   r   r  	dataclassrp  r   r  r   r  r!  rv  r"  r   r  r   rU   rU   rU   rW   ry    s   
 
+ry  c                   @  sn   e Zd ZU ejG dd dZi Zded< eej	Z
dZdZedddZe	ddddZedddZdS )rx  c                   @  rn  )zROCmCodeCache.CacheEntryrp   r  r  Nro  rU   rU   rU   rW   rp  .  rq  rp  rr  r   r4  Fr  rp   rP  rQ   r   c                 C  rt  ru  )r   r    r   rv  rw  rU   rU   rW   r   8  rx  zROCmCodeCache.writeNrQ  rR  r   c                 C  sx  | j sd| _ tttt  | ||\}}|| jvrddlm	} t
 }|tj||d td}|v |dt| j  | }	tj|	st|g|	||}
t }|
d}ztj|tjdtjd}td	| W n tjy } zt||j|d}~ww t }d
||  d|
 }t| ntd||	 t||	| j|< W d   n1 sw   Y  | j| j ||fS )z
        Compiles source_code into a file with dst_file_ext extension,
        using the compile command specific for the ROCm platform.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        Tr   rO  rQ  rR  Nrg  )ry  r   rz  zCompilation output: %szCompilation took r{  z+Skip compiling %s: output %s already exists)!_logged_compiler_versionrU  r  r)   rp   r!   r   r   rn  rP  r   rx   ry   rz   ro  ra  rv  r   r    r   r  r  r  r|  r  r}  r   r~  r  r  rx  rp  r  )r  r  rP  rQ  r   r  rP  r   r|  r  r  r  r  r  r  r  r  rU   rU   rW   r  G  sN   	


zROCmCodeCache.compiler  c                 C  r  r  r  r  rU   rU   rW   r   y  r  zROCmCodeCache.loadr  rT   r  r  )r   r   r   r  r  rp  r   r  r   r  r!  rv  r  r"  r   r  r   rU   rU   rU   rW   rx  ,  s   
 
1rx  c                   @  r  )CodeCacheFuturerQ   Callable[..., Any]c                 C  s   t rT   )rE  r   rU   rU   rW   r    rX   zCodeCacheFuture.resultNrQ   r  )r   r   r   r  rU   rU   rU   rW   r    r  r  c                   @  s$   e Zd Z	dddd	Zdd
dZdS )LambdaFutureN	result_fnr  r  Optional[Future[Any]]rQ   rR   c                 C  s   || _ || _d S rT   )r  r  )r   r  r  rU   rU   rW   r     s   
zLambdaFuture.__init__c                 C  s   |   S rT   )r  r   rU   rU   rW   r    rH  zLambdaFuture.resultrT   )r  r  r  r  rQ   rR   r  )r   r   r   r   r  rU   rU   rU   rW   r    s    r  )rO   r   rP   r   rQ   rR   )rQ   rY   r  )ru   rp   rQ   rv   )r   r   rQ   rp   r;  )r   r   r   r   rQ   rp   )r   rp   r   rp   r   rp   rQ   r   )r   r   )r   r   r   rp   r   rp   rQ   rp   )r   r   r   )r   r   r   rp   r   rp   r   rp   r   rp   rQ   r   )r   rp   rQ   rp   )FF)
r   rp   r   r   r   rY   r   rY   rQ   rR   )r  rm   rQ   rm   )r  r   rQ   r<   )rm  rn  ro  rp   rp  rq  rQ   rR   )rQ   r   )
r#  r$  r  r  r  r`   r  r  rQ   r  )r  r  rQ   r  )ry   rp   rQ   r   )r   rp   rO   r   rQ   r  )r  rp   r  r&   rQ   rR   )r  rp   r  r  rQ   rR   )r  rp   rQ   rR   )r<  rp   r  rp   rQ   rk   )rQ   r'  )rQ   rY  rT   )
rN  rY  rO  rp   rP  rp   rQ  rR  rQ   rp   )
__future__r   r   r*  r  r   r   r  r&  r  r   loggingrx   r  rv  rI  r  r  ru  r  rk  r  r  r	  r?  bisectr   r   r_  r   r   r   datetimer   r	   pathlibr
   r   r   typesr   typingr   r   r   r   r   r   r   r   Ztyping_extensionsr   rq   Ztorch.distributedr  r  r   r   Ztorch._dynamo.utilsr   r   r   Ztorch._inductorr   r   r   Ztorch._inductor.codegen.cudar   Z,torch._inductor.codegen.rocm.compile_commandr    r!   Ztorch._inductor.cpp_builderr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   Ztorch._inductor.cpu_vec_isar,   Z!torch._inductor.custom_graph_passr-   r.   Ztorch._inductor.freezing_utilsr/   r0   Z%torch._inductor.runtime.compile_tasksr1   r2   Z%torch._inductor.runtime.runtime_utilsr3   r4   Ztorch._inductor.utilsr5   r6   r7   r8   Ztorch._loggingr9   Ztorch._subclasses.fake_tensorr:   r;   r<   Ztorch._utils_internalr=   Ztorch.compilerr  Ztorch.compiler._cacher>   r?   Z%torch.fx.experimental.symbolic_shapesr@   rA   rB   Ztorch.utils._ordered_setrC   Zpackage.pt2_archive_constantsrE   r  rF   ZruntimerG   Zruntime.autotune_cacherH   Ztriton_bundlerrI   r  Z	triton.fbrJ   Ztorch._inductor.fb.utilsrK   rL   rM   rN   collections.abcr\   r]   r^   concurrent.futuresr_   r  r`   ra   r;  rc   Zirrd   ro   re   rf   rg   rh   Zruntime.hintsri   rj   Zruntime.triton_heuristicsrk   r  rl   rm   r:  r  ro  Z_loggingZgetArtifactLoggerr   r  	getLoggerrU  rt   r   r}   r{   r   r   r   r   r   r   r   r   r   r   r  r  r  r!  Picklerr"  r}  r  r  r  r  r=  rX  r  r  r  r  r  r#  r  r  r  r  r  r  r  r  r  r  r
  r	  r%  r=  rA  rC  rF  rG  rM  rY  rZ  ry  rx  r  r  rU   rU   rU   rW   <module>   sT   (
0







E
[
	




 
4
"	 

   I
   s

-n *@  a&_	0%KW^