a
    h_                 	   @  s	  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZm Z  d dl!m"Z" d dlm#Z#m$Z$ d dl%m&Z& d d	l'm'Z'm(Z( d d
l)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7 d dl8Z8d dl9m:Z; d dl8m<Z<m=Z= d dl>m?Z? d dl@mAZAmBZBmCZC d dlDmEZEmFZFmGZG d dlHmIZImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^ d dl_m`Z` d dlambZbmcZcmdZd d dlemfZfmgZg d dlhmiZi d dljmkZkmlZl d dlmmnZnmoZompZpmqZq d dlrmsZs d dltmuZumvZvmwZw d dlxmyZy d dlzmEZ{ d d l|m}Z}m~Z~mZ d d!lmZmZ d d"lmZ d d#lmZmZmZ d d$lmZ d%d&lmZ d%d'lmZ d%d(lmZ d%d)lmZ d%d*lmZ d%d+lmZ eE rd d,lmZ d d-lmZmZmZmZ nDd.d.d/d0d1d2Zd.d.d/d0d3d4Zd.d.d/d0d5d6Zd7d8d9d:Ze3d;Ze2rd d<lmZmZmZ d d=lmZ d%d>lmZ d%d?lmZ d%d@lmZ d%dAlmZ d%dBlmZmZ d%dClmZmZ d%dDlmZmZ d%dElmZ d%dFlmZ ejdGkZdHZe8jedIZeeZd7d8dJdKZdLd8dMdNZdLdLdOdPdQZejdLdRdSdTdUZG dVdW dWZG dXdY dYeǃZG dZd[ d[eǃZdLd8d\d]Zd^dLd_d`daZːddcdcdLdddedfZ̐ddLdLdLdgdhdidjZ͐ddcdLdLdLdldmdnZΐddcdLdLdLdLdodpdqdrdsZdLdLdtdudvZАd	dLdcd7d7d/dxdydzZejG d{d| d|Zd;d;d}d~dZdddddZG dd dejփZddLdd/dddZdddddZed^d8ddZdLd8ddZejG dd dZG dd de݃ZG dd dZddddddddZdddddZG dd de/e Ze~jG dd de}ZG dd dee ZejdLdpdddZeoG dd dZG dd dZdaded< dLd.ddddZejel dZejedZejdLdLd.dLdddZdLdodOddZd
dLd7dLdddZeoG ddĄ dăZdLdd/dƜddȄZeoG ddʄ deZeoG dd̄ deZeoG dd΄ deZdLdd/dМdd҄ZdLd/dӜddՄZeoG ddׄ d׃ZdLdLddٜddۄZdod8dd݄ZdLd8dd߄Zdd8ddZdLddddZdd8ddZ ed^d8ddZdd8ddZdd8ddZdLd8ddZdd8ddZdddLdLddLdddZG dd dZe#dLdLdddZeoG dd dZ	eoG dd dZ
G dd  d ZG dd deZG dd deZdS (      )annotationsN)bisect_right)copy)c_void_pCDLLcdll)	timedelta)	lru_cachepartial)Path)timetime_ns)
ModuleType)	AnyCallablecastGenericNoReturnOptionalTYPE_CHECKINGTypeVarUnion)overrideSelf)SymIntTensor)	SkipFrame)CompileEventLoggercountersdynamo_timed)configexcmetrics)custom_backend_passesinit_backend_registration)cuda_env)rocm_compile_commandrocm_compiler)in_toplevel_process)_LINKER_SCRIPT_set_gpu_runtime_env_TORCH_PATH_transform_cuda_pathsconvert_cubin_to_obj
CppBuilder
CppOptionsCppTorchDeviceOptionsget_compiler_version_infoget_ld_and_objcopy&get_name_and_dir_from_output_file_pathnormalize_path_separator)pick_vec_isa)CustomGraphModulePassCustomGraphPassCustomGraphPassType)has_frozen_paramsis_frozen_param)_reload_python_module)	cache_dirdefault_cache_dir)ALIGN_BYTESclear_on_fresh_cacheis_linux
is_windows)trace_structured)extract_tensor_metadata
FakeTensorTensorMetadata)log_cache_bypass)r    )CacheArtifactCacheArtifactFactoryCacheArtifactManager)TensorPropertiesWeights)CUSTOM_OBJ_FILENAME_PREFIX)has_hinthint_intShapeEnv)
OrderedSet   CompiledFxGraph)create_cache)autotune_cache)AutotuneCacheBundler)TritonBundler)V)build_paths)log_global_cache_errorslog_global_cache_statslog_global_cache_valsuse_global_cacher   Noneargskwargsreturnc                  O  s   d S N r`   ra   rd   rd   G/var/www/auris/lib/python3.9/site-packages/torch/_inductor/codecache.pyrZ      s    rZ   c                  O  s   d S rc   rd   re   rd   rd   rf   r[      s    r[   c                  O  s   d S rc   rd   re   rd   rd   rf   r\      s    r\   boolrb   c                   C  s   dS NFrd   rd   rd   rd   rf   r]      s    r]   T)	GeneratorKeysViewSequence)Future)_CompileFxKwargs)BuildOptionsBaseGraphLowering)ChoiceCaller)CompiledFxGraphConstants
OutputCode)
JsonDataTyRemoteCache)HalideInputSpec
HalideMeta)CachingAutotuner)	InputTypewin32iX  output_codec                  C  s,   t  r(tt s(ddlm}  |   S dS )z5
    Use for CUTLASS compilation only right now.
    r   should_build_locallyF)r    	is_fbcoder%   
nvcc_exist_cuda_compilertriton.fb.re_build_helperr   r~   rd   rd   rf   use_re_build   s    r   strc                   C  s   t jjd u rdS dS )NZ
cubin_pathZ
hsaco_pathtorchversionhiprd   rd   rd   rf   get_cpp_wrapper_cubin_path_name   s    r   devicerb   c                 C  s0   | dkrt jjd u rdS dS | dkr(dS dS d S )Ncudacubinhsacoxpuspv r   r   rd   rd   rf   get_kernel_bin_format   s
    r   Optional[Path])global_cache_dirrb   c                 C  s&   | d ur"t tj| t d S d S )Nhash)r   ospathjoin	CacheBase
get_system)r   rd   rd   rf   get_global_cache_path_impl   s    r   c                   @  s~   e Zd ZeejddddZeeejddddZeddd	d
Z	ddddZ
ddddZdddddZdS )r   dict[str, Any]rh   c               	   C  s   zddl m}  |  }W n ty,   d }Y n0 zrdd id|id}tjtj }tjjd ur|j|d d< tjj|d d< n|j	|d d< tjj
|d d	< W n ttfy   i }Y n0 ttj|d
dd |d< |S )Nr   )
triton_keynametriton)r   r   r   r   r   r   T)	sort_keysutf-8r   )Ztriton.compiler.compilerr   ModuleNotFoundErrorr   r   Zget_device_propertiesZcurrent_devicer   r   ZgcnArchNamer   AssertionErrorRuntimeErrorhashlibsha256jsondumpsencode	hexdigest)r   Ztriton_versionsystemZdevice_propertiesrd   rd   rf   r      s0    


zCacheBase.get_systemr   c                   C  s   t tjt dt d S )Ncacher   )r   r   r   r   r<   r   r   rd   rd   rd   rf   get_local_cache_path   s    zCacheBase.get_local_cache_pathr   c                   C  s
   t tjS rc   )r   r    r   rd   rd   rd   rf   get_global_cache_path   s    zCacheBase.get_global_cache_pathr^   c                 C  s   t  | _d S rc   )r   r   r   selfrd   rd   rf   __init__   s    zCacheBase.__init__c                 C  sN   |   }| si S t|}t|}W d    n1 s<0    Y  |d S Nr   )r   is_fileopenr   load)r   local_cache_pathZlocal_cache_fplocal_cacherd   rd   rf   get_local_cache   s    
(zCacheBase.get_local_cache)r   rb   c                 C  s0   |   }tt|tj| j|ddddd d S )N)r   r      )indentT	make_dirs)r   write_atomicr   r   r   r   )r   r   r   rd   rd   rf   update_local_cache  s    zCacheBase.update_local_cacheN)__name__
__module____qualname__staticmethod	functoolsr   r   r?   r   r   r   r   r   rd   rd   rd   rf   r      s   $r   c                   @  s.   e Zd ZdddddZddddd	d
ZdS )
LocalCacher   Optional[dict[str, Any]])keysrb   c                 G  s2   |   }|}|D ]}||v r&|| }q d S q|S rc   )r   )r   r   r   	sub_cachekeyrd   rd   rf   lookup  s    
zLocalCache.lookupr   r^   )r   valuerb   c                G  sL   |   }|}|dd D ]}||i  || }q|||d < | | d S )Nr   )r   
setdefaultr   )r   r   r   r   r   r   rd   rd   rf   	set_value  s    
zLocalCache.set_valueN)r   r   r   r   r   rd   rd   rd   rf   r     s   r   c                   @  s6   e Zd ZejddddZdddddd	d
dZdS )PersistentCacher   rh   c                 C  sV   |   }|d u s| si S t|}t|}W d    n1 sD0    Y  |d S r   )r   r   r   r   r   )r   Zglobal_cache_pathZglobal_cache_fpZglobal_cacherd   rd   rf   get_global_cache'  s    
(z PersistentCache.get_global_cachezlist[ChoiceCaller]r   z4Optional[Callable[[Any], dict[ChoiceCaller, float]]]zdict[ChoiceCaller, float])choicesopinputs	benchmarkrb   c              
     s  t  tt| j}tt| j}tt| j}i ddddd fdd}tjsptj	r~tj
r~|  ni }	||	st r||  |ds|durzv| tfd	d
 D sJ |	i  |	 i i   D ]"\}
}||	   |
 <  qW n2 tyV } z|| |W Y d}~n
d}~0 0 | |	 fdd D }|| nt r||  |d S )aG  
        Check to see if we have benchmarked the given choice callers. For each
        choice caller:

            1. Check global_cache[op][inputs][choice][precision], return benchmark if cached.
            2. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
            3. If benchmark is not None:
                a. `max_autotune_gemm=True`: benchmark the choice, update
                    local_cache[op][inputs][choice], and return the benchmark.
                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
        Nr   r   rg   )r   callbackrb   c                   sj   d} D ]N}|  }|| i i i v rN|    | |< qd} qXq|rf||d |S )z2Check if `cache` contains data for all the choicesTF)cached)hash_keyget)r   r   hitchoiceZchoice_hashr   r   r   Z	precisiontimingsrd   rf   check_cacheK  s     
z+PersistentCache.lookup.<locals>.check_cache)r   c                 3  s   | ]}| v V  qd S rc   rd   .0r   r   rd   rf   	<genexpr>i      z)PersistentCache.lookup.<locals>.<genexpr>c                   s   i | ]}|   | qS rd   )r   r   r   rd   rf   
<dictcomp>u  s   z*PersistentCache.lookup.<locals>.<dictcomp>)N)r   Zget_float32_matmul_precisionr
   r[   r   r\   rZ   r    Zmax_autotuneZmax_autotune_gemmZautotune_local_cacher   r]   r   allr   itemsr   r   r   )r   r   r   r   r   Z	log_statsZlog_valsZ
log_errorsr   r   r   ZtimingeZtimings_to_logrd   r   rf   r   0  sH      


zPersistentCache.lookupN)r   r   r   r   r   r   r   rd   rd   rd   rf   r   &  s   r   c                  C  s.   t jt d} t j| s*t j| dd | S )NlocksTexist_ok)r   r   r   r<   existsmakedirs)lock_dirrd   rd   rf   get_lock_dir  s    r   bytes)datarb   c                 C  s&   t t|  d d d S )N3   r   )base64	b32encoder   r   digestdecodelower)r   rd   rd   rf   sha256_hash  s    r   r   zUnion[str, bytes])codeextrarb   c                 C  sL   t | tr| n| d}|r@t |tr*|n|d}|d | }dt| S )Nr   s   ||c)
isinstancer   r   r   )r   r   Zhashing_strZextra_brd   rd   rf   	code_hash  s
    r   tuple[str, str, str])basename	extensionspecified_dirrb   c                 C  sb   |r(t j|r|}q@t jt |}nt jt | dd }t j||  d| }| ||fS )NrQ      .)r   r   isabsr   r<   )r   r   r   subdirr   rd   rd   rf   get_path  s    r  r   )contentr   	hash_typerb   c                 C  s8   |dv rt | |S |dv r&t t| S td| d S )N>   Zamdgcnr   Zptxr   >   r   r   r   zUnknown hash type )r   reprr   )r  r   r  rd   rd   rf   get_hash  s
    
r	  Optional[str]tuple[str, str])r  r   r   r  r   r   rb   c           	      C  sL   |d u rt |  ||}t|||\}}}tj|sDt|| dd ||fS )NTr   )r	  stripr  r   r   r   r   )	r  r   r   r  r   r   r   Z_subdirr   rd   rd   rf   write  s    r  )textrb   c                 C  s   t | dd S )zT
    Write the `text` to a file and return the path computed based on the hash.
    txtrQ   r  )r  rd   rd   rf   
write_text  s    r  F)path_r  r   encode_utf_8rb   c                 C  s   t |ttfsJ dt| }|r2|jjddd |jdt  dt	  d }t |tr`dnd}|j
||rrdnd d	}|| W d    n1 s0    Y  z|j|d
 W n0 ty   tsȂ tj||d t| Y n0 d S )Nz6Only strings and byte arrays can be saved in the cacheT)parentsr   r  z.tmpwwbr   encoding)target)srcdst)r   r   r   r   parentmkdirr   getpid	threading	get_identr   r  renameFileExistsError_IS_WINDOWSshutilcopy2remove)r  r  r   r  r   Ztmp_pathZ
write_modefrd   rd   rf   r     s"     (r   c                   @  s"   e Zd ZU dZded< ded< dS )TensorMetadataAndValueszk
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    rE   Ztensor_metadata	list[Any]valuesNr   r   r   __doc____annotations__rd   rd   rd   rf   r(    s   
r(  )xrb   c                 C  s   | S rc   rd   r.  rd   rd   rf   _ident  s    r0  r   rE   trb   c                 C  s&   t | }t| ds"tj|ddd}|S )zs
    Extracts the tensor metadata and removes fields of the TensorMetadata
    that are not needed for caching
    Z_is_inductor_staticr   N)Zstorage_offsetZstorage_bytes)rC   hasattrdataclassesreplace)r2  metard   rd   rf   %extract_tensor_metadata_for_cache_key  s    
r7  c                      s   e Zd ZdZd*dddd fddZd	d
dddZd	ddddZdddddZdddddZdddddZ	dddd d!Z
dd"dd#d$Zd%d&d'd(d)Z  ZS )+FxGraphCachePicklera:  
    Custom pickler to customize the pickling of some objects (Tensors), only for the
    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
    objects that don't pickle and/or vary between runs, and we want to capture the
    data that allow us to compute a stable, but safe hash.
    Ftorch.fx.GraphModulerg   r^   )gmhas_user_defined_triton_kernelsrb   c                   s   t  | _t | j tj | _| jt	t
| jtjt
| jtjjjt
| jtjt
| jtjjjjt
| ji |rt
| j| j|j< d| _dS )a2  
        Create an FX graph pickler. If include_non_inlined=True, then pickling will
        include the _values_ for all Tensors. (Note that any tensors are constants
        attached as attributes to the GraphModule). Otherwise, pickling will include
        only the metadata for these tensors.
        TN)ioBytesIO_streamsuperr   copyregdispatch_tabler   updaterD   r   r
   _reduce_fake_tensorr   r   _reduce_tensornnZ	parameter	Parameterr   _reduce_symintfxZexperimentalZ_backward_stateZBackwardState_reduce_unsupported_reduce_graph_module	__class__fast)r   r:  r;  rK  rd   rf   r   	  s$    
zFxGraphCachePickler.__init__r   z.tuple[Callable[[T], T], tuple[TensorMetadata]]r1  c                 C  s   t |}t|ffS )z7
        Custom reducer to pickle FakeTensors.
        )r7  r0  )r   r2  metadatard   rd   rf   rC  -  s    z'FxGraphCachePickler._reduce_fake_tensorzNtuple[Callable[[T], T], tuple[Union[TensorMetadata, TensorMetadataAndValues]]]c                 C  s   ddl m} |jrtdt|}t|r>||s>t|ffS t }|	 }t | }|dkrrt
d|dd tt||ffS )z
        Custom reducer to pickle Tensors.  If we see tensors, we know they're constants
        stored as attributes on the GraphModule.
        rQ   rq   zmkldnn tensors unpickleableg      ?z0FX graph cache copying of a large constant took z.1zs. Please file an issue.)graphrr   	is_mkldnnBypassFxGraphCacher7  r:   Zcan_inline_constantr0  r   tolistwarningswarnr(  )r   r2  rr   rN  startr*  elapsedrd   rd   rf   rD  6  s    

z"FxGraphCachePickler._reduce_tensorr   z#tuple[Callable[[T], T], tuple[str]])srb   c                 C  s   t t|ffS )z3
        Custom reducer to pickle SymInts.
        )r0  r   r   rW  rd   rd   rf   rG  X  s    z"FxGraphCachePickler._reduce_symintr   r   c                 C  s   t ddS )z{
        Custom reducer to handle any objects that we don't support and therefore
        raise to bypass caching.
        zReduce unsupportedN)rQ  rX  rd   rd   rf   rI  a  s    z'FxGraphCachePickler._reduce_unsupportedz&tuple[Any, tuple[dict[str, Any], str]]r:  rb   c                 C  sH   |  \}\}}|d }tdd|}tdd|}||d< |||ffS )a  
        Custom reducer for graph module to handle irrelevant data for user
        defined triton kernels
        Essentially what we are doing here is a huge hack where user defined
        triton kernel contain a dynamo time side table and the arguments to the
        call_function are indices into this side table. These arguments are not
        for hashing purposes since we included the source code into the cache
        key and the numbers are prone to give false negatives due to ordering.
        _codezkernel_idx = \d+r   zconstant_args_idx = \d+)
__reduce__resub)r   r:  fnr   Zimportsr   rd   rd   rf   rJ  h  s    z(FxGraphCachePickler._reduce_graph_moduler   objrb   c              
   C  s   zz0|  | | j W W | jd | jd S  ttfyr } z$tjddd t	d|W Y d}~n
d}~0 0 W | jd | jd n| jd | jd 0 dS )z<
        Pickle an object and return a byte string.
        r   zFailed to pickle cache keyTexc_infoN)
dumpr>  getvalueseektruncate	TypeErrorAttributeErrorlogwarningrQ  )r   r`  r   rd   rd   rf   r   {  s    
"zFxGraphCachePickler.dumpsr   c                 C  s   |  |}t|S )zE
        Serialize an object and return a hash of the bytes.
        )r   r   )r   r`  Zserialized_datard   rd   rf   r	    s    
zFxGraphCachePickler.get_hashFxGraphHashDetails	list[str])inprb   c           
        s   ddd fdd}g }t | D ]\}}t|trtt|D ]<} || }|d| d| d| d|||   qBq$t|tr| D ]8\}}	 |	}|d| d| d| d||	  qq$ |}|d| d| d	||  q$|S )
z
        Get a printable string describing in more detail all the attributes
        comprising an object. Useful for debugging when one graph hashes
        to a different value than another.
        r   r   r_  c                   sZ   t | tjrtt| S t | tr&dS t|  jv rNt jt|  | d S t| S d S )Nz<bytes>rQ   )r   r   r   r   r7  r   typerA  )r`  r   rd   rf   get_str  s    
z0FxGraphCachePickler.debug_lines.<locals>.get_str[z] z]: z: )	varsr   r   listrangelenr	  appenddict)
r   rm  ro  linesattrr`  iihkvrd   r   rf   debug_lines  s    
.

*
"zFxGraphCachePickler.debug_lines)F)r   r   r   r,  r   rC  rD  rG  rI  rJ  r   r	  r}  __classcell__rd   rd   rM  rf   r8    s   
 $	"	r8  zlist[str] | Nonezhashlib._Hash)rootsprefixhasherrb   c              	   C  s   t t| |dd dD ]}|j|jd }|d us8J |j}|d usJJ t|d0}||j	d ||
  W d    n1 s0    Y  |jrt|j|j d| qd S )Nc                 S  s   | j S rc   )r   r/  rd   rd   rf   <lambda>  r   z!build_code_hash.<locals>.<lambda>)r   rbr   r  )sortedpkgutiliter_modulesmodule_finder	find_specr   originr   rB  r   readispkgbuild_code_hashsubmodule_search_locations)r  r  r  libspecmoduler'  rd   rd   rf   r    s    ,r  zCallable[[], bytes])funcrb   c                   sN   g  dd fdd}ddd fdd}dd fd	d
}||_ ||_|S )z
    This function is a reimplementation of functools.lru_cache with a
    set function that allows prepopulating the cache.
    r   rh   c                     s    t  dkr    d S Nr   rt  ru  rd   _cacher  rd   rf   wrapper  s    z torch_key_cache.<locals>.wrapperr^   )valrb   c                   s   t  dksJ  |  d S r  r  )r  r  rd   rf   set_val  s    z torch_key_cache.<locals>.set_valc                     s       d S rc   )clearrd   r  rd   rf   r    s    ztorch_key_cache.<locals>.clear)setr  )r  r  r  r  rd   r  rf   torch_key_cache  s    r  c                  C  s~   t ddd^ t s<ddddd} | tW  d	   S d
dlm} |d dW  d	   S 1 sp0    Y  d	S )zS
    Compute a key that contains relevant information about torch source files
    Zinductor_codecache_torch_keyF)Zlog_pt2_compile_eventr   r   )rootrb   c              	     s   d}t jt  fdd|D }t }|tj	d t
| gd| |D ]H}t j|rNt|d}||  W d    qN1 s0    Y  qN| S )N)z"codegen/aoti_runtime/interface.cppz	script.ldc                   s   g | ]}t j |qS rd   )r   r   r   r   r.  Zinductor_rootrd   rf   
<listcomp>  r   z4torch_key.<locals>.get_code_hash.<locals>.<listcomp>r   r   r  )r   r   dirname__file__r   r   rB  r   __version__r   r  r   r   r  r   )r  Zextra_filesr  r   r'  rd   r  rf   get_code_hash  s    .z torch_key.<locals>.get_code_hashNr   parutilztorch/src_hash.txtascii)	r   r    r   r+   libfb.pyr  Zget_file_contentsrstripr   )r  r  rd   rd   rf   	torch_key  s    r  c                   C  s   t jtS rc   )r   r   r  r  rd   rd   rd   rf   get_inductor_root  s    r  c                   @  s   e Zd ZU dZded< dS )OrderedSetHolderzb
    See FxGraphHashDetails. Holds a sorted list to support stable hashing
    of set kwargs.
    r)  r   Nr+  rd   rd   rd   rf   r    s   
r  c                   @  s   e Zd ZdZdS )rQ  zI
    Exception to indicate that the FxGraphCache should be bypassed.
    N)r   r   r   r,  rd   rd   rd   rf   rQ    s   rQ  c                   @  sL   e Zd ZdZdgZddddddd	d
ZdddddZdddddZdS )rk  zz
    Object to capture all the details for a compiled FX graph relevant to computing
    a safe and stable cache key.
    Zgraph_idr9  Sequence[InputType]ro   Sequence[int]r^   r:  example_inputs	fx_kwargsinputs_to_checkrb   c                 C  sJ  || _ || _tj| _i | _t| D ]@\}}|| jvr&t|t	t
fv r\tt|| j|< q&|| j|< q&ddlm}m}m}	 ddlm}
 g | _|d ur\| D ]}t|tjjsqt|jjd|d|jjd|	dD ]}ddlm} ||jd }d }t||r.|j r(t!tdd	 |j D }|j"}|
|}|#|jd
 }| j$|||f qq|| _%t&dd	 |D  }|rtj'( rtj') | _*t+ t, tj-j.j/f| _0tj1j2j3j4tj1j2j3j5tj1j2j3j6f| _7t8 | _9t:; | _<t=j>dd| _?| @t=jA| _A| @t=jB| _B| Ct=jD| _D| Ct=jE| _EtF  tGtH| j@tIJ | _Id S )Nr   )kernel_side_table triton_kernel_wrapper_functionaltriton_kernel_wrapper_mutation)9user_defined_triton_kernel_transitive_closure_source_codecall_function)r   r  )	AutotunerZ
kernel_idxc                 s  s(   | ] }t d d |  D V  qdS )c                 s  s   | ]}t |V  qd S rc   )r   )r   kvrd   rd   rf   r   T  r   z8FxGraphHashDetails.__init__.<locals>.<genexpr>.<genexpr>N)r  Z
all_kwargsr   )r   r   rd   rd   rf   r   S  s   z.FxGraphHashDetails.__init__.<locals>.<genexpr>Zconstant_args_idxc                 s  s   | ]}t |tjV  qd S rc   )r   r   r   r  rd   rd   rf   r   i  r   F)Zignore_private_configs)Kr:  r  cconfigZcache_key_tagr  r  r   EXCLUDED_KWARGSrn  r  rP   r  Z*torch._higher_order_ops.triton_kernel_wrapr  r  r  Ztorch._inductor.codegen.wrapperr  user_defined_triton_sourcemodulesr   r   rH  GraphModule	itertoolschainrO  Z
find_nodesZtriton.runtime.autotunerr  Z
get_kernelra   configsr   r^  Zget_constant_argsru  r  anyZacceleratoris_availableZcurrent_device_indexZdefault_cuda_device_indexZ$are_deterministic_algorithms_enabledZ-is_deterministic_algorithms_warn_only_enabledutilsZdeterministicZfill_uninitialized_memoryZ!deterministic_algorithms_settingsbackendsr   matmulZ
allow_tf32Z&allow_fp16_reduced_precision_reductionZ&allow_bf16_reduced_precision_reductionZcuda_matmul_settingsr  Ztorch_versionr   r   Zsystem_infor    Zsave_config_portableZinductor_config_get_custom_pass_detailpost_grad_custom_pre_passpost_grad_custom_post_pass_get_custom_pass_detail_unsafe_pre_fusion_custom_pass_fuse_ddp_communication_passesr$   tuplemapr#   r*  )r   r:  r  r  r  r{  r|  r  r  r  r  r  noder  kernelr  Zkernel_sourceZconstant_argsZno_tensor_inputsrd   rd   rf   r     s    





zFxGraphHashDetails.__init__r   Optional[Any])custom_passrb   c                   sj   |sd S t |tr$ fdd|D S t |tr2|S t |trD| S t|rPd S tdtt| d S )Nc                   s   g | ]}  |qS rd   )r  r  r   rd   rf   r    r   zEFxGraphHashDetails._get_custom_pass_detail_unsafe.<locals>.<listcomp>zunknown config type: )r   rr  r   r7   uuidcallabler   rn  r   r  rd   r   rf   r    s    


z1FxGraphHashDetails._get_custom_pass_detail_unsafez1Union[CustomGraphPassType, CustomGraphModulePass]c                 C  s"   |sd S t |ttfsJ | S rc   )r   r7   r6   r  r  rd   rd   rf   r    s    z*FxGraphHashDetails._get_custom_pass_detailN)r   r   r   r,  r  r   r  r  rd   rd   rd   rf   rk    s    rk  r9  r  ro   r  ztuple[str, list[str]]r  c           
      C  sf   t | |||}t|jdk}t| |}d|| }||}d|}	td| d|	  ||fS )z=
    Generate a unique hash of the FX graph for caching.
    r   r'  
z$FX graph cache hash details for key z:
)	rk  rt  r  r8  r	  r}  r   ri  debug)
r:  r  r  r  detailsr;  Zpicklerr   r}  Z	debug_strrd   rd   rf   compiled_fx_graph_hash  s    	


r  int)time_saved_nsrb   c                 C  s|   t j rt j sdS t| d }t rZt jd}t	
d|| |t|| d 7 }t	
d| tjt|d |S )z}
    Ephemerally increases the NCCL timeout when compiling for a distributed job
    Returns amount of seconds increased
    r   g    eAz>pytorch/remote_cache:ephemeral_timeout_fudge_factor_percentagezNEphemeral NCCL timeout increase fudge factor %d and original increase value %dd   zIncreasing NCCL timeout by %d)seconds)r   distributedr  Zis_initializedr  r    r   _utils_internalZjustknobs_getval_intri  infodistZdistributed_c10dZ"_add_ephemeral_timeout_for_all_pgsr   )r  Zincreased_timeout_secZfudge_factorrd   rd   rf   .add_ephemeral_timeout_increase_for_distributed  s$    r  c                	   @  s   e Zd ZdZeddddddZeddddd	d
ddZeddddddddddZeddddddZedddddZ	dS )GuardedCachezJ
    Mixin for caches that have guards associated with their entries.
    ztype[GuardedCache[T]]r   )cls_keyrb   c                 C  s   t dd S )Nz.Implement _get_tmp_dir_for_key on parent classNotImplementedError)r  r  rd   rd   rf   _get_tmp_dir_for_key  s    z!GuardedCache._get_tmp_dir_for_keyrg   !Optional[RemoteCache[JsonDataTy]]z&Generator[tuple[T, bytes], None, None])r  localremote_cacher   rb   c           
   	   c  s*  |r|  |}tj|rtt|D ]v}zPttj||d(}| }t	
||fV  W d    n1 sp0    Y  W q( ty   tjddd Y q(0 q(|r&zX|| }d urt|tsJ |d }	t|	ttfsJ t|	}t	
||fV  W n& ty$   tjd| jdd Y n0 d S )Nr  z,fx graph cache unable to load compiled graphTra  r   z %s unable to load compiled graph)r  r   r   r   r  listdirr   r   r  pickleloads	Exceptionri  rj  r   r   rv  r   r   r   	b64decoder   )
r  r  r  r   r  r   r'  r  
cache_datar   rd   rd   rf   iterate_over_candidates  s2    
2
z$GuardedCache.iterate_over_candidatesz;Callable[[str, Union[list[int], list[torch.SymInt]]], bool]z	list[int]z3tuple[Optional[T], Optional[bytes], dict[str, str]])r  r   r  r  evaluate_guardshintsrb   c                 C  s   d}d}d}d}	|  |||D ]b\}
}t|
ds4J |
jsJ|
}|}d} qt||
j|}|rv|
}|}d}|
j}	 qqd}|
j}	qd|i}|	dur|	|d< |||fS )aY  
        Find the first cache entry in iterate_over_candidates that passes `evaluate_guards`.

        Args:
            key: The cache key to look up
            local: Whether to check the local cache
            remote_cache: The remote cache to check, if any
            evaluate_guards: Function that evaluates whether a guard passes the check,
                given a list of hint values and the guard expression.
            hints: List of symint hints paired with evaluate_guards

        Returns:
            A tuple of (graph, pickled_content) if found, or (None, None) if not found
        NZ	full_missguards_exprr   Z
guard_missZcache_status_detailedZcache_status_guard_expr)r  r3  r  rg   )r  r   r  r  r  r  rO  pickled_contentZresult_statusZsample_guards_expr	candidater  r   r  rd   rd   rf   find_guarded_entry  s0    zGuardedCache.find_guarded_entryr  zlist[torch.SymInt])r  r   rb   c                 C  s   dd |D S )z
        Get the backed SymInt objects from the input list. Note that we can never
        have guards that depend on unbacked symint.
        c                 S  s$   g | ]}t |tjrt|r|qS rd   )r   r   r   rM   r   rW  rd   rd   rf   r  Y  r   z7GuardedCache._filter_backed_symints.<locals>.<listcomp>rd   )r  r   rd   rd   rf   _filter_backed_symintsQ  s    z#GuardedCache._filter_backed_symintszOptional[ShapeEnv])r  rb   c                 C  s   t jj }|sdS |jjS )zG
        Helper to get the shape env from the tracing context.
        N)r   Z_guardsZTracingContextZtry_getZ	fake_mode	shape_env)r  ctxrd   rd   rf   _get_shape_env[  s    zGuardedCache._get_shape_envN)
r   r   r   r,  classmethodr  r  r  r  r  rd   rd   rd   rf   r    s   !<	r  c                   @  s4   e Zd ZeddddZeeddddZdS )	InductorCacheArtifactr^   rh   c                 C  s   t | j| j d S rc   )FxGraphCache_write_to_local_cacher   r  r   rd   rd   rf   populate_cacheh  s    z$InductorCacheArtifact.populate_cacher   c                   C  s   dS )Ninductorrd   rd   rd   rd   rf   rn  l  s    zInductorCacheArtifact.typeN)r   r   r   r   r  r   rn  rd   rd   rd   rf   r  f  s
   r  c                   @  s.  e Zd ZdZeddddZedddddd	Zed
ddddddZed6ddddddddddZ	eddddddZ
edddddddd d!Zed"dd#d$d%Zed"dd#d&d'Zed"dd(d)dd*d+d,d-Zeddd.d/Zed7dd0dddddddd1	d2d3Zeddd4d5ZdS )8r   a6  
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphCacheDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          <temp dir>/<fx graph hash>/<serialized metadata>
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    r   rh   c                   C  s   t jt dS )zS
        Get the toplevel temporary directory for storing compiled graphs.
        Zfxgraph)r   r   r   r<   rd   rd   rd   rf   _get_tmp_dir  s    zFxGraphCache._get_tmp_dirztype[FxGraphCache])r  r   rb   c                 C  s   t jt |dd |S )zA
        Return the disk location for a given cache key.
        rQ   r  )r   r   r   r   r  r  r   rd   rd   rf   r    s    z!FxGraphCache._get_tmp_dir_for_keyrS   r   rt   z0tuple[Optional[CompiledFxGraph], dict[str, Any]])rO  
cache_info	constantsrb   c                   s\  j  }rht|}| }durht||d< tjd|jd tjd|jd t|jdkrhttj	d z0
| dd	lm} |jdur|j W n ty   d|f Y S 0 t }jtj|d
 tjj td  j7  < td td  tddd fddd tdfddd td fddfddd |fS )ah  
        Cache specific post compile steps that need to run if we find a graph in the cache
        This includes putting bundled triton artifacts in the right place,
        reloading the PyCodeCache artifact, etc.

        These don't always happen (i.e. on a cache miss, so they are in a separate function from
        CompiledFxGraph.post_compile)
        Ntriton_bundler_metaZinductor_compile)cached_kernel_nameszAOTAutogradCache.inductor_loadr   Znum_triton_bundlesrQ   rq   r   r  zOutput code: 
%szOutput code written to: %sZartifactc                   S  s
   dddS )NZfx_graph_runnablestring)r   r  rd   rd   rd   rd   rf   r    s    z5FxGraphCache.cache_hit_post_compile.<locals>.<lambda>c                     s    j S rc   )Zrunnable_graph_strrd   rO  rd   rf   r    r   )Zmetadata_fn
payload_fnZinductor_post_grad_graphc                     s    j S rc   )Zinductor_post_grad_graph_strrd   r  rd   rf   r    r   r  Zinductor_output_codec                     s   d iS )Nfilenamerd   rd   )artifact_pathrd   rf   r    r   c                     s    S rc   rd   rd   r
  rd   rf   r    r   )Z_triton_bundlerW   Zread_and_emitr   r   Ztry_add_pt2_compiler	  rt  try_increment_toplevelZafter_deserializationrO  rr   Zsave_output_codesource_codeOSErrorrU   Zinductor_meta_from_configrV   Zbegin_compiler"   ZCachedMetricsHelperZapply_deltasZmetrics_deltasr   Zcounter_deltasoutput_code_logr  rB   )rO  r  r  Zbundler  r6  rr   Zinductor_metard   )r  r   rO  rf   cache_hit_post_compile  sX    







z#FxGraphCache.cache_hit_post_compileNr  rg   r  zEOptional[Callable[[str, Union[list[int], list[torch.SymInt]]], bool]])r   r  r  r  r  r  rb   c                 C  s   t  }|dusJ t |}dd |D }tjr:dd }|du rH|j}t }	t | ||||\}
}}|	| |
du rd|	fS |durt	
t | | |
jrt||
j|}|du sJ td| |j t |
|	|S )a  
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.
        `constants` tracks a list of constants, or a way to obtain the list of constants
        associated with a given cache entry
        `evaluate_guards` allows AOTAutogradCache and other callers to customize
        what constitutes a guard success. Normally, a guard hit happens if
        `shape_env.evaluate_guards_expression` returns True.
        Nc                 S  s   g | ]}t |qS rd   )rN   r  rd   rd   rf   r    r   z.FxGraphCache._lookup_graph.<locals>.<listcomp>c                 S  s   dS )NTrd   )r.  yrd   rd   rf   r    r   z,FxGraphCache._lookup_graph.<locals>.<lambda>Tz*fx graph cache key %s post-load guards: %s)r   r  r  r    Z&unsafe_skip_cache_dynamic_shape_guardsZevaluate_guards_expressionrv  r  rB  rI   record_artifactr  rn  r  rg   ri  r  guardsr  )r   r  r  r  r  r  r  symintsr  r  rO  r  Z
guard_infocheckrd   rd   rf   _lookup_graph  s4    




zFxGraphCache._lookup_graphr   r^   )r   r  rb   c                 C  sH   t | }tj|s$tj|dd tj|t|}t||dd d S )NTr   r   )	r   r  r   r   r   r   r   r   r   )r   r  r  r   rd   rd   rf   r  &  s
    
z"FxGraphCache._write_to_local_cacheru   )r   compiled_graphr  r  r  rb   c                 C  sT  ddl m} t||s*J dt| dt }|dus>J t|}||}|j||d|_	t
|}	|	  zt|	}
W n6 ty   tjddd	 td
 d  d7  < Y dS 0 z`tt | |
 |rt| |
 |rt|	jpdd }t|
d|d}|| | W n6 tyN   tjddd	 td
 d  d7  < Y n0 dS )z=
        Store a serialized CompiledFxGraph on disk.
        rQ   rR   zserialization for z NYIN)Zplaceholdersr  z1fx graph cache unable to serialize compiled graphTra  r  Zfxgraph_cache_pickle_errorr   g    .Ar  )r   time_taken_msz!fx graph unable to write to cacheZfxgraph_cache_write_error)
compile_fxrS   r   rn  r   r  r  Zget_pruned_guardsZproduce_guards_expressionr  r   Zprepare_for_serializationr  r   r  ri  rj  r   rI   r  r  r  r  _time_taken_nsr   	b64encoder   put)r   r  r  r  r  rS   r  r  r  Zdisk_compiled_graphr  r  r  rd   rd   rf   _save_graph2  sH    	


zFxGraphCache._save_graphr9  rY  c                 C  s   |   D ]x}t|tjjsq|jjD ]Z}t|jtjj	rV|j
 sVtd|j  |jdkr$tt| |jtjjr$tdq$qd S )Nz!Can't cache HigherOrderOperator: getattrzCan't cache torchbind objects)r  r   r   rH  r  rO  nodesr  Z_opsZHigherOrderOperator	cacheablerQ  r   r   r$  _CScriptObject)r:  r  r  rd   rd   rf   _check_for_hopm  s    zFxGraphCache._check_for_hopc                 C  s   t jt jfD ]"}|rt|tr&| stdqt jdurNtt jtsNtdt jD ]}t	|rTt|tsTtdqTt
| rtjdstdt jjrtddd	lm} |jrtd
 tt du rtd tdt|  dS )z
        Check some conditions that would preclude caching and raise BypassFxGraphCache
        to bypass in case caching is not possible.
        z!Unsupported post grad custom passNz#Unsupported _pre_fusion_custom_passz(Unsupported _fuse_ddp_communication_passz,pytorch/inductor:allow_freezing_with_cachingz$Skipping graph with frozen constantszORuntime constant folding can introduce constants that aren't static across runsr   )CompilerBisectorz$dont cache graph when bisect enabledzfx graph cache no shape envzNo shape env)r    r  r  r   r7   r  rQ  r  r  r  r9   r   r  Zjustknobs_checkaot_inductorZuse_runtime_constant_foldingZ!torch._inductor.compiler_bisectorr*  Zbisection_enabledri  r  r   r  r)  )r:  pr*  rd   rd   rf   _check_can_cache  s2    





zFxGraphCache._check_can_cachero   r  z6tuple[Optional[tuple[str, list[str]]], dict[str, Any]])r:  r  r  r  remoterb   c           	   
   C  s   z t |  t| |||\}}W np ty } zXtd d  d7  < td| |rbtdt| dt|t	 d}d|fW  Y d}~S d}~0 0 ||fi fS )	a  
        Checks that the inductor input is cacheable, then computes
        and returns the cache key for the input.
        Returns (key_info, cache_info) where:
        - key_info is (hash_key, debug_lines), and
        - cache_info will contain debug info in the event of BypassFxGraphCache.

        NB: It is possible to have this function return a union instead. But
        I personally believe it is more annoying/difficult to read in that format.
        r  Zfxgraph_cache_bypassrQ   z%Bypassing FX Graph Cache because '%s'Zbypass_fx_graphbypass)cache_stateZcache_bypass_reasoncache_event_timeN)
r   r-  r  rQ  r   ri  r  rF   r   r   )	r:  r  r  r  r.  r   r}  r   r  rd   rd   rf   prepare_key  s     
zFxGraphCache.prepare_keyc                  C  s   d} t | t ddS )zK
        Attempts to load the remote cache, returns None on error.
        zfx-graph-v1ZFbRemoteFxGraphCacheZRemoteFxGraphCache)rT   r    r   )Zcache_idrd   rd   rf   get_remote_cache  s    zFxGraphCache.get_remote_cacherl  )	r   r}  r  r  r  is_backwardr  r  rb   c                 C  s  t | |||||\}}	i |	| |t d}	|durtd|  td d  d7  < d|	d< |rttjd	 ttj	d
|  |j
 }
dur|
|	d< ttjd|
d  t|
 }dkr||	d< nJ|rttjd ttj	d|  td|  td d  d7  < d|	d< ||	fS )z
        Lookup the graph with the given key, and return results and metadata.
        Doesn't do any logging on its own, because AOTAutograd handles a cache miss
        differently from FXGraphCache.
        )r   
componentsr1  Nzfx graph cache hit for key %sr  Zfxgraph_cache_hitrQ   r   r0  Z"inductor_fx_remote_cache_hit_countZ!inductor_fx_remote_cache_hit_keysr  Z distributed_ephemeral_timeout_usi  r   Zephemeral_timeout_increaseZ#inductor_fx_remote_cache_miss_countZ"inductor_fx_remote_cache_miss_keyszfx graph cache miss for key %sZfxgraph_cache_missZmiss)r   r  r   ri  r  r   r   r  r  Zadd_to_set_toplevelr   r  )r   r}  r  r  r  r4  r  r  r  r  r  Zephemeral_increaserd   rd   rf   load_with_key  sd    
zFxGraphCache.load_with_keyc                   C  s*   zt t  W n ty$   Y n0 dS )z.
        Clear out the on-disk cache.
        N)r$  rmtreer   r  FileNotFoundErrorrd   rd   rd   rf   r  )  s    zFxGraphCache.clear)N)N)r   r   r   r,  r   r  r  r  r  r  r  r#  r)  r-  r2  r3  r6  r  rd   rd   rd   rf   r   r  s6   K	 ;:.$ "Hr   )r   rb   c                 C  s8   |  drtj| S |  dr,tj| S | dfS dS )zDReturns the path where the AOT Inductor compiled kernels are stored..soz.pt2r   N)endswithr   r   split)r   rd   rd   rf   split_aot_inductor_output_path4  s
    

r<  c                
   @  sj   e Zd ZU i Zded< eejZedddddddddd	d
Z	edddddZ
eddddZdS )CudaKernelParamCachezdict[str, dict[str, Any]]r   Nr   zdict[str, Optional[str]]r
  r^   )r   paramsr   bin_typeasmasm_typerb   c                 C  s  d }t jjr4t jjsJ d|d s,J d|d }t|||tt jjd |d\}}	t|	\}}t jj	rddd}
||

 v sJ d	tj|	\}}||
|  }	d
}t jj	st jjr|sJ d|sJ dt|||tt jjd |d\}}|	|t < ||d< || j|< d S )Nz:package_cpp_only requires triton kernel names to be uniqueZmangled_namezMissing kernel namer   )r  r   r   z.fatbinz.spv)r   r   z3multi_arch_kernel_binary only supported in CUDA/XPUr   zMissing kernel assembly codezMissing kernel assembly typer@  )r    r+  package_cpp_onlyr   Zunique_kernel_namesr  r<  output_pathr3   emit_multi_arch_kernelr   r   r   splitextr   r   )r  r   r>  r   r?  r@  rA  r   _Zbin_pathZbin_type_to_ext	base_pathZasm_pathrd   rd   rf   r  D  s^    






zCudaKernelParamCache.setr   )r   rb   c                 C  s   | j |d S rc   )r   r   r  rd   rd   rf   r     s    zCudaKernelParamCache.getzKeysView[str]rh   c                 C  s
   | j  S rc   )r   r   )r  rd   rd   rf   get_keys  s    zCudaKernelParamCache.get_keys)NN)r   r   r   r   r-  r   r  cache_clearr  r  r   rH  rd   rd   rd   rf   r=  ?  s   

  ;r=  c                	   @  s.   e Zd ZdZedddddddddd	Zd
S )AotCodeCompilerz.
    Compile AOT Inductor generated code.
    rr   r   r
  rl  z%Union[list[Union[str, Weights]], str])rO  wrapper_codekernel_codeserialized_extern_kernel_nodesdevice_typeadditional_filesrb   c          W        s  |}t jdkrtdt  t }tddt|jdd}	t|		 }
t
 o\dko\jtt
jj\}}t
jjrdfd	td
|
|t
jjd\}	d	 d td|
|t
jjd\}td<}|ddf |  tjj|jdd W d   n1 s0    Y  t
jjrL|	 t
jjsL| td	 td td	fddfddd tdfddfddd t 	}t }|j!| " sЈj#dd t$t d }dddd fd!d"}d#d$l%m&} t' }|t(j)||d% t*d&}|	B |rt$|+d'}t,|d(}|| W d   n1 sp0    Y  t
jjr|| t
jj-}|d)< t$|.|j/ d*}t
jj-0 D ]*\}}t1|t$rt1|t$sJ d+qt,|d($}|t23t
jj- W d   n1 s"0    Y  t$|.|j/ d*}t45|| t
jjrv|| t
jjsv|| |rt
jjnt$|+d,}t6fd-d.j78 D d/d0dd1d2d3 t
jj9rd4 fd5d.j78 D }nd4}t
jj:r"t;fd6d7j78 D } ||  t<|}!t
  o<|!d8k}"t
jj=rLd}"j|"|d9}#tf dt
jj d:|#}$tf d;di|#}%t
jj>rt?st@jd<}&tA|&|
fd=t
jj i|#|$_BtC }'rtA|'|
fi |#|%_Btt$|j/	t$|j!|$d>}(|(	 })|(D }*tt$|j/t$|j!|%d>}+|+	 },|+D }-tEd?|) tEd@|, t
jjrt$|.|j/ dA}.|$F|. ||. |(G| |(H|	 || n^z|(I  W nH tJjKtLfy }/ z(dBt$|/v rtdC|/|/W Y d}/~/n
d}/~/0 0 |+I  |"s|}0d#}1n4tMtNtOPd#tOQtOjRjSdDT }1tUVdE|!dF |1}0||0t j}2d#}3i }4tWjX0 D ]\}3\}5}6t1|6tOjYjZj[r|6j\}6t1|6tOj]j^sJ t_ |3 }7tEdG|5|7 |7|4|5< tOj]`|6}8t(j)|j!|7}9ta|9|8d ||9 qt|4r`t(j)|j!dH}:t,|:d( }|t23|4 W d   n1 sL0    Y  ||: tOjbjcrptd nte };|;jf5 }<|;jfg  |<rt
jjhrJ dIg }=g }>ti\}?}@tjjk0 D ]\}A}B|BdJ  }Cr|>|C |Btl  }Dt
jjhrJdKkrJtm }Etn  dL|C dM|D dN|E dO|E dN|E dP|E dQ}Ftojp|Fq ddddR t
jjrr|=ts|D|A|?|@ qtt|\}G}Ht|jdS}I|*|-|2g|<|=}Jt|G|J|H|Id>}K|K	 }L|KD }tEdT|L t,	dU>}|d |dV|) d |dW|L d W d   n1 	s0    Y  t,dU>}|d |dV|, d |dW|L d W d   n1 	sr0    Y  t
jj
rt$|.|j/ dX}M|IF|M ||M |tu |"
r*t$|.|j/ dY}Nt,|NdZ,}O|O| |OtUVd[|1 W d   n1 
s0    Y  ||N n||2 |KH||2 t
jjh
rb|Kv||> |w|> n.g |<|=}J|w|J |JD ]}P|KH||P 
q||Kx| n|KI  |JD ]}Q|Q|<v 
r
qt(y|Q 
q|"rNd#dlz}R|R{ }StSd\|S}Tt,|d]J}U|U| }V|Ud^|T|V|T    |U| |UtUVd[|1 W d   n1 sD0    Y  t
jjrb|| W d   n1 sx0    Y  t
jjr|S |S )_z
        Returns the .so path, or returns a list of files that were generated if
        config.aot_inductor.package=True.
        r|   z.AotCodeCompiler not yet supported for inductoroi)vec_isarN  aot_moder   sourcesBuildOptioncpur  r   zwrapper.cpp)r   r   r   z.// Triton kernels are embedded as comments in z
kernel.cppzw+cpp)r   NzWrapper code written to: %szKernel code written to: %sZ
graph_dumpc                     s   dd dS )NZinductor_aot_wrapper_coderX  r   rn  r  rd   rd   )wrapper_pathrd   rf   r    s    z)AotCodeCompiler.compile.<locals>.<lambda>c                     s    S rc   rd   rd   )rK  rd   rf   r    r   r  c                     s   dd dS )NZinductor_aot_kernel_coderX  rY  rd   rd   )kernel_pathrd   rf   r    s    c                     s    S rc   rd   rd   )rL  rd   rf   r    r   Tr   zCMakeLists.txtr   r   )constsplatformrb   c                   s$  |dkr@j tj @ r6t| dkr0tdd}nd}d}n |dkrRd}d	}ntd
| t| dk}d| d}|dt d7 }|d| d7 }|| d7 }|s| D ]}|d| d7 }q| s|d7 }n |d7 }|dt| d  d7 }|d| d7 }|| d7 }t|dt	d\}}t
|}t dkr> ndjdd}	tt	|jt	|t	|j|	d }
|
 }|
  |rt|d!v}|d" |d}|d#}|d$ksJ || d"}|t| k r|| |d  }||7 }qW d    n1 s0    Y  t| |S )%Nlinux 5wzPModels with buffer mutation included doesn't support constants greater than 2GB!z.ldata, "aw"z.lrodata, "a"r   darwinz__DATA,__datarF  zUnsupported platform: i   z
	.section	r  z		.balign z	.globl	z_binary_constants_bin_start
z_binary_constants_bin_start:
z	.byte z
	.space 1
z	.quad 0x1234567899abcdef
z	.space    z.globl	z_binary_constants_bin_end
z_binary_constants_bin_end:
S)r   r   rW  T)rN  rS  compile_onlyuse_relative_pathr   rU  
output_dirrV  zr+br   s   ͫxV4r   )Zmutated_buffersrP   r  r   rt  
ValueErrorr   r>   r  r   r   r0   rS  r.   stemr  get_target_file_pathbuildr   re  r  findr   r&  )r\  r]  Zsection_attrZsymbol_prefixZis_large_constsZ
consts_asmr   rF  Zconsts_sZobject_build_optionsZobject_builderconsts_or'  hdrZ	start_idxposrc)rN  rO  specified_sub_dirrd  rd   rf   _compile_consts  sx    





,
z0AotCodeCompiler.compile.<locals>._compile_constsr   FileLock.locktimeoutz.jsonr  ZAOTI_DEVICE_KEYz_metadata.jsonz"Metadata must only contain stringsr9  c                 3  s$   | ]}| j vr |jV  qd S rc   )folded_constantsget_original_value_of_constantis_cudar   r   r  rd   rf   r     s   
z*AotCodeCompiler.compile.<locals>.<genexpr>ztorch.Tensorrg   )r2  all_cudarb   c           	      S  s   ddddd}dd l }|  dkr(dS | jrLtjj| }tjj| }n|  	 }| }|
 }||||j| }t|j}|r|S ||S )Nr   )	raw_bytesrb   c                 S  s$   |  t| t d t t d}|S )NrQ       )ljustrt  r>   )r|  Zpadded_bytesrd   rd   rf   _pad_to_alignment  s
    zEAotCodeCompiler.compile.<locals>._to_bytes.<locals>._pad_to_alignmentr   r   )ctypesZnumelrP  r   opsZmkldnndata_ptrZ_nbytesZuntyped_storagerW  nbytesr   POINTERc_ubyter   contents)	r2  r{  r  r  r  r  Zt_cpuZ	raw_arrayr|  rd   rd   rf   	_to_bytes  s     	
z*AotCodeCompiler.compile.<locals>._to_bytesr   c                 3  s(   | ] }|j vr |V  qd S rc   )rw  rx  rz  )r  r{  rO  rd   rf   r     s   
c                   s6   i | ].}| j vr j|  |t j| fqS rd   )rw  Zallocated_constant_namerx  rJ   r  rz  r  rd   rf   r     s   
z+AotCodeCompiler.compile.<locals>.<dictcomp>r_  )rS  rN  use_mmap_weightsrd  rR  rc  min_optimizerc  )rS  r  re  z#aot wrapper compilation command: %sz"aot kernel compilation command: %sz_compile_flags.jsonz is too big to optimizezUPlease use torch._inductor.config.aot_inductor.compile_wrapper_opt_level = 'O0' flag.)rQ   Zqqra  zsaving script object %s as %szcustom_objs_config.jsonz<TODO: add emit_multi_arch_kernel support for cutlass kernelsr@  r   z	 -fatbin  -o z -gencode arch=compute_z,code=compute_z	,code=sm_ )capture_outputr  r  )rR  rN  rS  rd  zaot linkage command: %saz// Compile cmd
// z// Link cmd
// z_linker_flags.jsonz_serialized_weights.binr  qi @  za+b    )}sysr]  r   r*   r5   r.   r0   rS  r  get_command_liner    r   r<  r+  rC  rB  r   r  Zmodel_name_for_generated_filestempfileNamedTemporaryFile
writelinesflushrX   r  r}   r   packageru  r  r  rB   r   r  r   r  r   torch.utils._filelockrs  r   r   r   LOCK_TIMEOUTwith_suffixr   rN  	with_namerh  r   r   r   r   r$  r   r   r  r   Zpackage_constants_in_soZpackage_constants_on_diskrK   rt  Zforce_mmap_weightsZprecompile_headersr#  _get_cpp_wrapper_header_precompile_headerprecompiled_header_get_cpp_prefix_headerri  ri  Zsave_flags_to_jsonZsave_compile_cmd_to_cmakeZsave_src_to_cmakerj  r!   ZCppCompileErrorr   r   r  r   randintZiinfoint64maxitemstructpack	enumerateZtorchbind_constantsZ_libraryZfake_class_registryZFakeScriptObjectZreal_objr'  r(  rL   Z_pickle_saver   r   r   ROCmCodeCacheCUDACodeCacheaot_kernels_or  rD  r2   r=  r   r   _nvcc_arch_as_compile_optionr   
subprocessrunr;  Zembed_kernel_binaryr-   r3   r)   Zsave_kernel_asm_to_cmakeextendZsave_link_cmd_to_cmaker&  resourceZgetpagesizetell)Wr  rO  rK  rL  rM  rN  rO  Zgenerated_filesZpicked_vec_isaZvec_isa_cmd_genZcpp_commandZspecified_output_pathZspecified_artifact_nameZwrapper_keyrF  r2  Zwrapper_path_operatorZkernel_path_operatorZ
cmake_pathrq  rs  r   lockZextern_kernel_nodes_jsonr'  rN  Z	meta_jsonr{  r|  Zkernel_meta_jsonZ	output_soZserialized_weightsZweights_dictZconsts_sizer  compile_commandZwrapper_build_optionsZkernel_build_optionsZheader_fileZ
cpp_prefixZwrapper_builderZwrapper_compile_cmdZ	wrapper_oZkernel_builderZkernel_compile_cmdZkernel_oZcompile_flagsr   Zaot_constantsZmagic_numberrl  Zcustom_obj_idxZqual_name_to_idr   ZconstantZcustom_obj_nameZcustom_obj_bytesZcustom_obj_pathZconstants_config_jsonZgpu_codecacheZgpu_kernels_oZcubins_oZ	asm_filesZldZobjcopykernel_namer   Zasm_fileZ
cubin_fileZcurrent_archcmdZoutput_namerf  Zso_build_optionsZobj_srcsZ
so_builderZlink_cmdZlinker_flagsZweight_fileZ	f_weightsr`  Zo_filer  Z
page_size_Z	page_sizeZf_soZso_sizerd   )
r  r{  rN  rO  rL  r[  rp  rd  rK  rZ  rf   compile  s   




	2





	



N*


4
















0






2
2






2





2
*
zAotCodeCompiler.compileN)r   r   r   r,  r  r  rd   rd   rd   rf   rJ    s   rJ  zOptional[CDLL]_libgompz%Union[list[c_void_p], c_void_p, None])r   r`   rb   c                   sf  ddd fdd  fdd|D }|  ds<J | d d }t| d	D ]$\}}|d
krht|}t||}qNt|sJ | d t }t|j	j
|D ]\}}|jr|||j< q|r|t| d = ||i |}	|	d u rd S t|	ttfr>dd |	D }	t|	D ]$\}}
t|
tjs
J | d q
tjj|	S t|	tjsXJ | d tjj|	S )Nr   )argrb   c                   sN   t t| dkrtjj| S t| ttfrFt|  fdd| D S | S d S )Nz<class 'PyCapsule'>c                 3  s   | ]} |V  qd S rc   rd   r   r  convert_argrd   rf   r     r   z9custom_op_wrapper.<locals>.convert_arg.<locals>.<genexpr>)	r   rn  r   r'  _aotiZ&alloc_tensor_by_stealing_from_void_ptrr   rr  r  )r  r  rd   rf   r    s
    z&custom_op_wrapper.<locals>.convert_argc                   s   g | ]} |qS rd   rd   r   r  r  rd   rf   r    r   z%custom_op_wrapper.<locals>.<listcomp>z
torch.ops.z, can not be called through custom_op_wrapperr  r   z, can not be loaded through custom_op_wrapperc                 S  s"   g | ]}|d u rt g n|qS rc   )r   Ztensor)r   rrd   rd   rf   r    r   z returns a list of non-tensorsz returns a non-tensor)
startswithr  r;  	importlibimport_moduler$  r  rv  zipZ_schema	argumentsZ
kwarg_onlyr   rt  r   rr  r  r   r   r'  r  Z#unsafe_alloc_void_ptrs_from_tensorsZ!unsafe_alloc_void_ptr_from_tensor)r   r`   Zconverted_argsr  rQ  rW  ra   Zfunc_argZconv_argresultr  rd   r  rf   custom_op_wrapper  s6    	
r  Zprecompiled_headersr   )headerhashable_cmd_liner  rb   c              	   K  s$  t rJ dt }t|d }|d|  d tt|d d t|tf i |ddid}|  d	d	d
dd}||	 }W d    n1 s0    Y  tf i |ddi}t
d|  dd|| t|  td\}	}
t|
|
|d}tjtdd ttjt|	 d|f |
S )Nz>CppBuilder does not currently support precompiling on Windows!z
header.hppz
#include <z>
ZpreprocessingTrT  r   r  rb   c                 S  s$   t jdd| fddd}|j d S )zReading the whole preprocessed header in for hashing is very expensive,
            but calling a fast hashing utility in a subprocess is cheap.Zopensslsha512T)r  r  r   )r  r  stdoutr;  )r  Z
cmd_outputrd   rd   rf   _get_file_checksum$	  s    z._precompile_header.<locals>._get_file_checksumZprecompilingrz  )r  r   r   r   r   rt  )r#  r  TemporaryDirectoryr   r  r.   r   r0   rj  ri  r  r1   Zget_compiler_HEADER_DIRr   r   _HEADER_LOCK_DIR_worker_compile_cppr   r   )r  r  r  Zpreprocessing_dirZpreprocessing_headerpreprocessorr  Zpreprocessor_hashZheader_build_optionZheader_hashZheader_full_pathcpp_builderrd   rd   rf   r  
	  sH    

	*



r  c                 C  s   |  drdS d S )NrW  z torch/csrc/inductor/cpp_prefix.h)r  r   rd   rd   rf   r  J	  s    
r  )r   rS  rb   c                 C  s@   |  dd }tjjo|dk}d|r(dnd d|r6dn| d	S )
zGiven a device type (and optionally whether we're in AOT Inductor mode), returns
    the path to the cpp_wrapper header file to be precompiled.:r   rW  ztorch/csrc/inductor/Zaoti_includeZcpp_wrapper/Z	array_refz.h)r;  r    r+  Zallow_stack_allocation)r   rS  Zbase_deviceZis_array_refrd   rd   rf   r  P	  s    

r  c                	   @  s   e Zd ZU dZi Zded< eejZi Z	ded< eddddd	d
Z
eddddddZedddddZeddddddddddZeddddddZdS )CppCodeCachezCompiles and caches C++ libraries.  Users of this class supply the source code to
    be compiled, while compilation flags are set by CppBuilder.0dict[str, Callable[[], Union[CDLL, ModuleType]]]r   r   cpp_compile_command_flagsr   zUnion[CDLL, ModuleType]r   r   rb   c                 C  s
   t | S rc   )r   LoadLibrary)r   r   rd   rd   rf   _load_library_innere	  s    z CppCodeCache._load_library_innerc              
   C  s   z|  ||}||_|W S  ttfy } zdt|v rptjdrpt	da
|  ||}||_|W  Y d }~S dt|v rt| dt  dt  d| W Y d }~n
d }~0 0 d S )NZgompz/usr/lib64/libgomp.so.1z(failed to map segment from shared objectz3.  The most common reason this may occur is if the zl folder is mounted with noexec (e.g., by default Docker mounts tmp file systems as noexec).  Please remount zi with exec enabled, or set another temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable.)r  r   ImportErrorr  r   r   r   r   r   r  r  r  
gettempdir)r  r   r   r  r   rd   rd   rf   _load_libraryi	  s&    
zCppCodeCache._load_library
str | Noner   c                 C  s   dS )z_
        Given a device type, returns the path to a CPP header file to be precompiled.
        Nrd   r  r   rd   rd   rf   _get_uncompiled_header	  s    z#CppCodeCache._get_uncompiled_headerrW  Nrd   r   Sequence[str]r
  )	main_coderN  	submit_fnextra_flagsoptimized_coderb   c                   sb  i j ||t t d}t  tf t||dud|}tf ddi|}dddd	d
}	|	|}
|	|}t|d| d|
 d\}|rt|d|d\}}ntj	}j
vrXddlm} tjt d }ddtjrBtsB| }rt||
fd|dui||_|rBt| }rBt||fi ||_t|\}}t||||d}|rt|\}}t||||d}t|| | gtf i ||d}tt||||ft|  ntt||ft|  dd fdd}|durN||td& tj s.|W d   n1 sD0    Y  |j
< j
 S )z\Compile and load a C++ library.  Returns a callable that returns the loaded
        library.)rN  r  rd  rR  Nr  rc  Trp   r   )build_optionrb   c                 S  s   t dd| d S )zWriting the code to file will calculate a hash, which we need to vary if
            the command line flags change.  This implements a mostly-generic way of
            validating that.rP  rQ  rT  )r.   r  )r  rd   rd   rf   get_hashable_command_line	  s    z:CppCodeCache.load_async.<locals>.get_hashable_command_linezmain.cppr  r   zoptimized.cppr   rr  rt  r  )r   rU  rV  rf  r   rh   c                    sF   d u rBd ur    } | d u s*J  d usBJ S rc   )r  r  )r  Zbinary_pathr  futurer   r  Z	worker_fnrd   rf   load_fn
  s    z(CppCodeCache.load_async.<locals>.load_fnru  )r  r    r   r5   r*   r0   rg   r  r   devnullr   r  rs  r   r   r   Zcpp_cache_precompile_headersr#  r  r  r  r  r3   r.   ri  r   r
   r  r4   r  r   )r  r  rN  r  r  r  r  Zmain_build_optionZoptimized_build_optionr  Zmain_cmd_lineZoptimized_cmd_lineZ	main_pathrF  Zoptimized_pathrs  	lock_pathr  Z	main_namerf  Zmain_builderZoptimized_nameZoptimized_builderZlinkerr  rd   r  rf   
load_async	  s    



(
zCppCodeCache.load_asyncr_   c                 O  s   | j |i | S rc   )r  r  r`   ra   rd   rd   rf   r   
  s    zCppCodeCache.load)rW  Nrd   N)r   r   r   r,  r   r-  r   r  rI  r  r  r  r  r  r  r   rd   rd   rd   rf   r  \	  s&   

     r  zSequence[CppBuilder])r  cpp_buildersrb   c                 C  s^   ddl m} || td2 |D ]}tj| s|  qW d    n1 sP0    Y  d S )Nr   rr  ru  )r  rs  r  r   r   r   ri  rj  )r  r  rs  Zbuilderrd   rd   rf   r   
  s
    r  c                   @  s   e Zd ZU i Zded< eejZdddZdZ	dZ
dZed	Zed
d
ddddZed
ddddZed!dd
d
ddddddddZedddddd ZdS )"CppPythonBindingsCodeCacher  r   FTZinclude_pytorchZsharedr  zkernel({}); Py_RETURN_NONE;r   aU  
        // Python bindings to call {entry_func}():
        #define PY_SSIZE_T_CLEAN
        #include <Python.h>
        #include <sstream>
        #include <cstdlib>

        #ifndef _MSC_VER
        #if __cplusplus < 202002L
        // C++20 (earlier) code
        // https://en.cppreference.com/w/cpp/language/attributes/likely
        #define likely(x)       __builtin_expect(!!(x), 1)
        #define unlikely(x)     __builtin_expect(!!(x), 0)
        #endif
        #else
        #define likely(x) (x)
        #define unlikely(x) (x)
        #endif

        // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow.
        // We manually link it below to workaround issues with fbcode build.
        static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj);

        template <typename T> static inline T parse_arg(PyObject* args, size_t n) {{
            static_assert(std::is_pointer_v<T>, "arg type must be pointer or long");
            return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n)));
        }}
        template <> inline int64_t parse_arg<int64_t>(PyObject* args, size_t n) {{
            auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == -1 && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return result;
        }}
        template <> inline uintptr_t parse_arg<uintptr_t>(PyObject* args, size_t n) {{
            auto result = PyLong_AsVoidPtr(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == reinterpret_cast<void*>(-1) && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return reinterpret_cast<uintptr_t>(result);
        }}

        {extra_parse_arg}

        static PyObject* {entry_func}_py(PyObject* self, PyObject* args) {{
            try {{
                if(unlikely(!PyTuple_CheckExact(args)))
                    throw std::runtime_error("tuple args required");
                if(unlikely(PyTuple_GET_SIZE(args) != {arg_len}))
                    throw std::runtime_error("requires {arg_len} args");
                {call_entry_func}
            }} catch(std::exception const& e) {{
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            }} catch(...) {{
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }}
        }}

        static PyMethodDef py_methods[] = {{
            {{"{entry_func}", {entry_func}_py, METH_VARARGS, ""}},
            {{NULL, NULL, 0, NULL}}}};

        static struct PyModuleDef py_module =
            {{PyModuleDef_HEAD_INIT, "{entry_func}", NULL, -1, py_methods}};

        PyMODINIT_FUNC PyInit_{entry_func}(void) {{
            const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
            if(!str_addr) {{
                PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
                return nullptr;
            }}
            std::istringstream iss(str_addr);
            uintptr_t addr = 0;
            iss >> addr;
            _torchinductor_pyobject_tensor_data_ptr =
                reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
            PyObject* module = PyModule_Create(&py_module);
            if (module == NULL) {{
                return NULL;
            }}
            #ifdef Py_GIL_DISABLED
                PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
            #endif
            return module;
        }}
        r   r   r  c                 C  s   t tjjjjtjd< | d| j }zt	j
| W S  tyD   Y n0 tj||}|d us`J tj|}|t	j
|< |jd usJ |j| |S )NZ'_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTRr  )r   r   r'  Z_dynamor  Z'_torchinductor_pyobject_tensor_data_ptrr   environentry_functionr  r  KeyErrorr  utilspec_from_file_locationmodule_from_specloaderexec_module)r  r   r   module_namer  r  rd   rd   rf   r  
  s    


z.CppPythonBindingsCodeCache._load_library_innerr  r   c                 C  s   t |S rc   )r  r  rd   rd   rf   r  
  s    z1CppPythonBindingsCodeCache._get_uncompiled_headerrW  r   Nrd   r  r  r   r
  )argtypesr  rN  num_outputsr  r  rL  rb   c                   sz   d dd t|D } jjt| j| j jj|dd}	 j||	 ||||dddd	 fd
d}
|
S )aV  
        Wrap a C++ function in fast Python bindings.

        Args:
            argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
            main_code: C++ source code containing ENTRY_FUNCTION().  Will be built at
                -O3 if kernel_code is None (to maximize performance in any kernels that
                are present), or -O1 otherwise (to minimize compile time).
            kernel_code: If present, C++ source code that will be built at -O3 and
                linked to main_code.

        Returns:
            A python version of ENTRY_FUNCTION()
        , c                 s  s,   | ]$\}}d | dd d| dV  qdS )z
parse_arg<zconst r   z>(args, )N)r5  )r   nZargtyperd   rd   rf   r   
  s   zBCppPythonBindingsCodeCache.load_pybinding_async.<locals>.<genexpr>)Z	array_len)Zarg_lenZcall_entry_funcZ
entry_funcextra_parse_arg)r  r  r  Nr   rh   c                     s(   d u r t tsJ t jS rc   )r   r   r$  r  rd   r  Z
get_resultr  rd   rf   r  
  s    z?CppPythonBindingsCodeCache.load_pybinding_async.<locals>.future)	r   r  suffix_templateformatrt  call_entry_functionr  r  r  )r  r  r  rN  r  r  r  rL  Z	parseargssuffixr  rd   r   rf   load_pybinding_async
  s&    

z/CppPythonBindingsCodeCache.load_pybinding_asyncr_   c                 O  s   | j |i | S rc   )r  r  rd   rd   rf   load_pybinding
  s    z)CppPythonBindingsCodeCache.load_pybinding)rW  r   Nrd   N)r   r   r   r   r-  r   r  rI  r  r  r  r  textwrapdedentr  r  r  r  r  r  rd   rd   rd   rf   r  -
  s0   

Y      4r  c                   @  sT   e Zd ZU i Zded< eejZdddZdZ	dZ
edZedd	d
ddZdS )CppWrapperCodeCacher  r   Tr  Zinductor_entry_cppzreturn inductor_entry_cpp({});a	  
        #include <torch/csrc/inductor/aoti_torch/c/shim.h>

        static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {{
            std::vector<AtenTensorHandle> result;
            size_t result_len = PyList_GET_SIZE(pyvec);
            result.reserve(result_len);
            for (size_t i = 0; i < result_len; i++) {{
                // AtenTensorHandle is essentially a pointer
                void* elem = PyCapsule_GetPointer(PyList_GET_ITEM(pyvec, i), NULL);
                result.push_back(reinterpret_cast<AtenTensorHandle>(elem));
            }}
            return result;
        }}

        static inline PyObject* pack_tensor_handle_list(const std::array<AtenTensorHandle, {array_len}>& arr) {{
            PyObject* result = PyList_New({array_len});
            for (size_t i = 0; i < {array_len}; i++) {{
                PyObject *elem =
                    arr[i] == nullptr
                        ? Py_None
                        // Store AtenTensorHandle as PyCapsulate
                        : PyCapsule_New(reinterpret_cast<void*>(arr[i]), NULL, NULL);
                PyList_SET_ITEM(result, i, elem);
            }}
            return result;
        }}

        template <> inline std::vector<AtenTensorHandle> parse_arg<std::vector<AtenTensorHandle>>(PyObject* args, size_t n) {{
            return unpack_tensor_handle_list(PyTuple_GET_ITEM(args, n));
        }}

        PyObject* inductor_entry_cpp(std::vector<AtenTensorHandle>&& input_handles) {{
            // For outputs, we only allocate an array to hold returned tensor handles,
            // not the actual output tensor storage.
            std::array<AtenTensorHandle, {array_len}> output_handles{{}};
            try {{
                inductor_entry_impl(input_handles.data(), output_handles.data());
                if (PyErr_Occurred()) {{
                    return nullptr;
                }}
                return pack_tensor_handle_list(output_handles);
            }} catch(std::exception const& e) {{
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            }} catch(...) {{
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }}
        }}
        r   r  r   c                 C  s   t |S rc   )r  r  rd   rd   rf   r  "  s    z*CppWrapperCodeCache._get_uncompiled_headerN)r   r   r   r   r-  r   r  rI  r  r  r  r  r  r  r  r  rd   rd   rd   rf   r	  
  s   

6r	  c                   @  sF  e Zd ZU i Zded< eejZdZded< e	
dZee	
d Zee	
d Ze	
d	Zed
ddddddZeddd
dddZeejd
dddZed
d
d
dddZeejd
d
dddZeejd
d
ddd Zed/dd
d!d"d#d$d%Zed!d!d"d&d'd(Zed
dd)d*Zed
d+d,d-d.ZdS )0HalideCodeCachez0dict[str, Callable[[], Union[ModuleType, CDLL]]]r   Nr
  _standalone_runtime_patha  
        #include "{halideruntime_h}"
        #include "{headerfile}"
        #include <stdexcept>
        #include <cmath>

        namespace c10 {{
            inline long div_floor_integer(long a, long b) {{
                if ((a<0) != (b<0)) {{
                    const auto quot = a / b;
                    const auto rem = a % b;
                    return rem ? quot - 1 : quot;
                }}
                return a / b;
            }}
        }}
        z
        void kernel({argdefs}) {{
            {buffers}
            int err = halide_kernel({buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a{  
        #include <cuda.h>
        static const halide_device_interface_t* cuda_interface = halide_cuda_device_interface();

        void kernel({argdefs}, uintptr_t stream) {{
            {buffers}
            int err = halide_kernel(reinterpret_cast<void*>(stream), {buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a  
        #include "{}"
        #include <cuda.h>

        static int acquire_context(void* user_context,
                                   void** cuda_context_out,
                                   bool create) {{
            return cuCtxGetCurrent(reinterpret_cast<CUcontext*>(cuda_context_out));
        }}

        static int release_context(void* user_context) {{
            return 0;
        }}

        static int get_stream(void* user_context,
                              void* cuda_context,
                              void** stream_out) {{
            *stream_out = user_context;
            return 0;
        }}

        static int register_halide_hooks() {{
            halide_set_cuda_acquire_context(&acquire_context);
            halide_set_cuda_release_context(&release_context);
            halide_set_cuda_get_stream(&get_stream);
            return 0;
        }}

        int inductor_register_halide_hooks_result = register_halide_hooks();
        r   rx   rg   rl  )r   r  r   rb   c                 C  sV  |j d usJ |jd ur,t|j t|jks0J |jd us>J |jpH|j d|j }|rtd| d}d}d}d}nd}d}d| d}d	}g }	t|j |jD ] \}
}|	d
|
 d| d qd| dd| dd|	 d| d| d| d| d| d| d| d| d| d|	  d| dt|	 d| d| d| dg
S )Nz + zreinterpret_cast<uint64_t>(r  Zcuda_interfaceZnullptrZhalide_buffer_flag_device_dirty0zreinterpret_cast<uint8_t*>(Zhalide_buffer_flag_host_dirtyzhalide_dimension_t(0, r  zhalide_buffer_t ;zhalide_dimension_t z_dims[] = {z};z
.device = z.device_interface = z.host = z	.flags = z.type = z.dimensions = z.dim = z_dims;z.padding = nullptr;)
shapestridert  offsetalias_ofr   r  ru  r   Zhalide_type)r  r   r  r   r  r   Zdevice_interfacehostflagsdimssizer  rd   rd   rf   _codegen_bufferu  s6    "
zHalideCodeCache._codegen_bufferry   object)r6  
headerfilerb   c           
      C  s   |  }|d|jv u sJ d|jv s(J g }g }t|jD ]V\}}| rv|d|  || d| || q:d|jvsJ ||j	 q:d
dd |D  }|r| jn| j}|j| |rd	nd
|d
dd |jD |d
|d}	|	S )NZuser_contextZ
no_runtimez&hl_buf_Zhl_buf_*r  c                 S  s   g | ]}d | qS )    rd   )r   linerd   rd   rf   r    r   z1HalideCodeCache._codegen_glue.<locals>.<listcomp>HalideRuntimeCuda.hzHalideRuntime.hr  c                 s  s,   | ]$}|j d u r|  d|j V  qd S )Nr  )r  bindings_typer   r  rd   rd   rf   r     s   
z0HalideCodeCache._codegen_glue.<locals>.<genexpr>)Zhalideruntime_hr  Zargdefsbuffersbuffer_names)ry  r  r  r  Z	is_bufferru  r  r  ctyper   r   lstripglue_template_cudaglue_template_cppr  find_header)
r  r6  r  ry  r  r  rQ  r  Zglue_templateZ	glue_coderd   rd   rf   _codegen_glue  s2    

zHalideCodeCache._codegen_gluerh   c                 C  s:   t ddt d}| }td| j| j| j|gdS )NOIrT  r  r   )	r.   r/   r  r   r   r#  r"  standalone_runtime_cuda_initr   )r  Zcommand_genZcommand_linerd   rd   rf   config_hash  s"    zHalideCodeCache.config_hash)r  errmsgrb   c           	   
   C  s  t jjd}|d u s|js$tdz|jd }t|D ]}|dr:zt	
dtj||g}W n t	jy|   Y q:Y n0 td|d}|r:tjtj|d| }tj|r:tj|  W S q:W n0 ty } zt||W Y d }~n
d }~0 0 t|d S )	Nhalidez$halide python bindings not installedr   r9  Zlddz(/.*)/libHalide.sor   rQ   )r  	machinery
PathFinderr  r  r   r   r  r:  r  check_outputr   r   SubprocessErrorr\  searchr   abspathgroupr   r  )	r  r*  r  r0  fileoutmr   r   rd   rd   rf   _search_for_file  s*    


 z HalideCodeCache._search_for_filer   rb   c                 C  sV   d|    d}dtjv r>tjtjd |}tj|r>|S d| d}t||S )NZlibautoschedule_r9  
HALIDE_LIBCan't find z3, set env HALIDE_LIB to the directory containing it)r   r   r  r   r   r   r
  r6  )r   Zsofiler   r*  rd   rd   rf   find_libautoschedule  s    

z$HalideCodeCache.find_libautoschedulec                 C  s   dt jv r.t jt jd | }t j|r.|S dt jv rjt jt jt jd d|  }t j|rj|S d|  d}td|  |S )NZHALIDE_INCLUDEr8  z../include/r9  z7, set env HALIDE_INCLUDE to the directory containing it)r   r  r   r   r   r1  r
  r6  )r   r   r*  rd   rd   rf   r$    s    


zHalideCodeCache.find_headerr   Callable[[], Any])r6  r  r  rb   c              
     s  t tt|t|  |fddd }tj|dd d t|d }t|d }t|d }t|d	 }t|d
 }	tj	| }
g }|
rt
|| tj|ddd| ddddg
}|jr|d| |jg ||  |ttj| dd |jD }| r|d | j|| ||||  f|
r8|jnd | rHdndd |
r|tt| tt|	|}|r||jn|  dd fdd}|S )Nr  r+     Tr   zgenerate_kernel.pyzhalide_kernel.azhalide_kernel.hdoner  -gr  -oz-fZhalide_kernelz-ezstatic_library,h,schedulez-pc                 S  s   g | ]}|j d u r| qS rc   )r  r  r  rd   rd   rf   r  +  s   z9HalideCodeCache.generate_halide_async.<locals>.<listcomp>Z	uintptr_tr   rW  )r  r  rN  r;  rh   c                     s   r
    S rc   rd   rd   Zbindings_futureZwait_for_compilerd   rf   r   @  s    z3HalideCodeCache.generate_halide_async.<locals>.load)r   r  r   r  r)  r   r   r   r   r   r   r  
executableZ	schedulerr  r:  r`   ru  r   r
   r  
check_callr  ry  r  r%  build_standalone_runtimetouch_worker_task_halider  )r  r6  r  r  dirpathZgenfileZlibfiler  ZdonefilelockfileZneed_compilejobsr  Zbinding_typesZtaskr   rd   r@  rf   generate_halide_async  sr    	




z%HalideCodeCache.generate_halide_asyncr_   c                 O  s   | j |i | S rc   )rI  r  rd   rd   rf   generate_halideG  s    zHalideCodeCache.generate_halidec              	   C  s  | j rtj| j r| j S tj r(dnd}d}|dkr<dnd}| j r`tj| j rXJ t }nt }t	|d| d| 
   }tj|dd	 t|d
 }t|d }t|d }t|d }	t|| }
tj|sdd l}ddlm} ||t tj|st|d2}|dkr6|| j| d W d    n1 sL0    Y  ||	|| t|
\}}t|||	g|t|dd}tt|  t | W d    n1 s0    Y  tj|
sJ |
| _ |
S )Nr   rW  zlibStandaloneHalideRuntime.soz	host-cudar  zhalide-runtime--Tr   r=  r  z	hooks.cppzstandalone_halide_runtime.ar   rr  r  r  rN  re  )!r  r   r   r   r   r   r  r=   r<   r   r)  r   r   r+  r  rs  r  r   r  r(  r  r$  Zcompile_standalone_runtimeZTargetr3   r.   r0   r  rB  shlexr;  r  rD  )r  rN  Zlibnamer  baserF  Z	done_fileZ	lock_fileZ	hook_fileZa_fileZso_fileZhlrs  r'  r   rf  Zhalide_cmd_genrd   rd   rf   rC  K  s^    
$	(z(HalideCodeCache.build_standalone_runtimer  r   c                 C  s   dS )z5Header precompiling is currently disabled for halide.Nrd   r  rd   rd   rf   r    s    z&HalideCodeCache._get_uncompiled_header)N)r   r   r   r   r-  r   r  rI  r  r  r  r  r#  r"  r(  r  r  r%  r   r)  r6  r:  r$  rI  rJ  rC  r  rd   rd   rd   rf   r
  '  sN   

	!! D:r
  zlist[partial[Any]])rG  rH  rb   c                 C  s  ddl m} z@|| t  |D ]
}|  qW d    n1 s>0    Y  W n0 tjy| } ztjddkrft|dd^}}}tj	
|drft| }d}	||	d	ksJ G d
d d}
|d}t|tsJ |
 ||d	 < ttddg|dd}||	|}tdd}||  W d    n1 sL0    Y  td| | W Y d }~n
d }~0 0 d S )Nr   rr  ZHALIDE_REPRO1r  )r   r   r   pythonz    hl.main()rQ   c                   @  s   e Zd ZddddZdS )z _worker_task_halide.<locals>.Outr   rh   c                 S  s   dS )Nr4  rd   r   rd   rd   rf   __repr__  s    z)_worker_task_halide.<locals>.Out.__repr__N)r   r   r   rQ  rd   rd   rd   rf   Out  s   rR  r?  z                        import sys, tempfile
                        with tempfile.TemporaryDirectory() as out:
                            sys.argv = zrepro.pyz?
                            hl.main()
                        r  r  zwrote repro.py: )r  rs  r  r  r/  r   r  r   r$  r   r   r  r   r  countindexr   r  r  r   r  r5  r  r!  r   )rG  rH  rs  Zjobr   rP  scriptr  r   mainrR  cireplfdrd   rd   rf   rE    s6    ,
.rE  r  c                 C  s   t | d  d S )Nr  )r   close)r  rd   rd   rf   rD    s    rD  c                   @  s   e Zd ZU g Zded< i Zded< i Zded< ed!ddd	d
ddZed"dddd
ddZ	ed#ddddddddZ
ed$dddddZeejdddddd ZdS )%PyCodeCachezlist[ModuleType]r  zdict[str, ModuleType]modules_no_attrz dict[str, list[tuple[Any, ...]]]linemapsr   r   r  )r  r   rb   c                 C  s   t |d|dS Npyr  r  )r  r  r   rd   rd   rf   r    s    zPyCodeCache.writer   c                 C  s   t |d|d\}}| ||S r^  )r  load_by_key_path)r  r  r   r   r   rd   rd   rf   r     s    zPyCodeCache.loadNzOptional[list[tuple[int, str]]]r   )r   r   linemapattrsrb   c           	      C  s   |d u rg }|d u r(|| j v r(| j | S t }t|||d}|rRtt| | j|< |d urx| D ]\}}t||| qb|r|d u r|| j |< | j	| |S )N)Zset_sys_modules)
r\  r(   r;   rr  r  r]  r   setattrr  ru  )	r  r   r   ra  rb  Zin_toplevelmodr{  r|  rd   rd   rf   r`    s     

zPyCodeCache.load_by_key_pathFrg   r^   )purgerb   c              	   C  sV   |r>| j D ]2}z|jsJ t|j W q
 ty:   Y q
0 q
| j   | j  dS )z
        Clear the in-memory module cache. If purge=True, also delete all the
        corresponding on-disk source files.
        N)r  r  r   r&  r8  r  r\  )r  re  rd  rd   rd   rf   rI    s    


zPyCodeCache.cache_clearr  zOptional[list[dict[str, Any]]])r   linenorb   c                 C  st   || j vrd S t| j | dkr$d S | j | \}}t||}|dkrHd S ||d  }|s\d S ddddd}||S )Nr   rQ   r   zlist[dict[str, Any]])stack_tracerb   c                 S  s"   d}t || }dd t|D S )Nz"File "(.+)", line (\d+), in (.+)\nc                 S  s"   g | ]\}}}|t ||d qS ))r  r  r   )r  )r   r'  lr  rd   rd   rf   r    s   zPPyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace.<locals>.<listcomp>)r\  findallreversed)rg  regexmatchesrd   rd   rf   parse_stack_trace  s
    z<PyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace)r]  rt  r   )r  r   rf  rw  r%  r,  entryrm  rd   rd   rf   stack_frames_for_code  s    


z!PyCodeCache.stack_frames_for_code)r   )r   )NN)F)r   r   r   r  r-  r\  r]  r  r  r   r`  rI  r   r   ro  rd   rd   rd   rf   r[    s    
  !r[  rz   )r  r  rb   c                 C  s   t t|| S rc   )r$  r[  r   )r  r  rd   rd   rf   _load_triton_kernel_from_source!  s    rp  c                   C  s~   t tjjrtjjS t r0tjt	j
ddS t tdrLtddS t tdrztjtjtdddS dS )NbinZnvccZCUDACXXr   Z	CUDA_HOMEzbin/nvcc)r%   r   r    r   Zcuda_cxxr   r   r   r   rY   Zsdk_homegetenvrealpathrd   rd   rd   rf   r   '  s    r   c                  C  s*   t  rddlm}  | dS t jjS d S )Nr   r  zcutlass-3-headers)r    r   r  r  Zget_dir_pathr   cutlass_dirr  rd   rd   rf   _cutlass_path3  s    
ru  rl  c                   C  s   g dS )N)includeztools/library/includeztools/library/srcztools/util/includerd   rd   rd   rd   rf   _cutlass_paths<  s    rw  )
build_rootrb   c                 C  sH   t  }t }t  D ]0}tj||}tj| |}tj||dd q|S )NT)dirs_exist_ok)rw  ru  r   r   r   r$  copytree)rx  pathsZcutlass_rootr   old_pathnew_pathrd   rd   rf   _clone_cutlass_pathsE  s    
r~  c                     s   t    fddt D S )Nc                   s"   g | ]}t jt j |qS rd   )r   r   rs  r   r   r   Zcutlass_pathrd   rf   r  Q  s   z*_cutlass_include_paths.<locals>.<listcomp>)ru  rw  rd   rd   r  rf   _cutlass_include_pathsO  s    
r  c               	   C  s   t  rxtjddR} t| *}|  W  d   W  d   S 1 sP0    Y  W d   n1 sn0    Y  t	 }t
t jjgd| | S )zz
    Compute a key representing the state of the CUTLASS library.

    Note: OSS and fbcode will have different keys.
    Zcutlasszsrc_hash.txtNr   )r    r   r  	resourcesr   r   r  r   r   r   r  r   rt  r   )resource_pathresource_fileZcombined_hashrd   rd   rf   cutlass_keyX  s    
Vr  c                  C  s   t   ddlm}  | jdd}t r>|tjtj	tjdg7 }g }t
 rt| |D ]*}d|v rbqT|d| dd	| g qT|d
 |d ntd|S )zO
    Util function for CUTLASS backend to find the correct CUDA libraries.
    r   )cpp_extensionr   rL  Zstubsz	torch/libz-Lz-Xlinkerz-rpath=z-lcudaz-lcudartzMUnsupported env, failed to find cuda libs! Currently only Linux is supported.)r*   Ztorch.utilsr  Zlibrary_pathsr   rY   Zsdk_libr   r   r   r@   r,   r  ru  r  )r  ZlpathsZextra_ldflagsr   rd   rd   rf   _cuda_lib_optionsi  s*    
r  c                   C  s   g dS )N)z-fPICz-fno-strict-aliasingz-fvisibility=hiddenz-Wconversionrd   rd   rd   rd   rf   _nvcc_host_compiler_options  s    r  c                  C  s$   t  } | dkrdS | dkr dS | S )NZ90Z90aZ100Z100a)r%   Zget_cuda_arch)archrd   rd   rf   r    s    r  c                  C  s   t  } d|  d|  g}tjjr2|d|  g7 }dddddd	|  d
d| dtjjdddg
}t r|dtj	
tjg tjjr|g d tjjr|g d tjjr|ddg |S )NZsm_Zcompute_Zlto_z-t=0z"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1z+-DCUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES=1z'-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLEDz-wz-gencode=arch=compute_z,code=[,]z
-std=c++17z--expt-relaxed-constexprz-DNDEBUGz-ccbin)z	-lineinfor>  z-DCUTLASS_DEBUG_TRACE_LEVEL=1)z--keepz,--ptxas-options=--warn-on-local-memory-usagez --ptxas-options=--warn-on-spillsz--resource-usagez--source-in-ptxz--use_fast_mathz -DCUTLASS_USE_TANH_FOR_SIGMOID=1)r  r    r   Zenable_cuda_ltor   Zcompile_opt_levelr   r  r   r   r  rY   ZgccZenable_debug_infoZenable_ptxas_infoZuse_fast_math)r  r   optionsrd   rd   rf   _nvcc_compiler_options  s<    	r  Optional[list[str]])	src_filesdst_filedst_file_ext
extra_argsrb   c                 C  s@  |d u rg }t  rBtj|}t|}dd | D } tj|}nt }t }t }t	 }|| dd |D  dd |D  | }	d
| }
d}|dkrt  dd
|	 d| d|
 }nt|d	kr|	d
 t  dd
|	 d| d|
 }n>|dkr t  dd
|	 d| d|
 }ntd| dtd| |S )Nc                 S  s   g | ]}t j|qS rd   )r   r   r   )r   src_filerd   rd   rf   r    r   z(cuda_compile_command.<locals>.<listcomp>c                 S  s(   g | ] }d |v rd| nd| qS )=z-Xcompiler z-Xcompiler=rd   )r   optrd   rd   rf   r    s   c                 S  s   g | ]}d | qS )z-Ird   r  rd   rd   rf   r    r   r  r   rP  z -c -o soz-sharedr  ZexezUnsupported output file suffix !zCUDA command: %s)r   r   r   r  r~  r   r  r  r  r  r   r   ru  r  ri  r  )r  r  r  r  Z
build_pathZinclude_pathsZcuda_lib_optionsZnvcc_host_compiler_optionsZnvcc_compiler_optionsr  r  resrd   rd   rf   cuda_compile_command  sF    

$
$
$r  c                   @  sx   e Zd ZdZdddddZdddd	Zddd
dZdddddZddddZdddddZ	ddddZ
dS )
DLLWrapperz A wrapper for a dynamic library.r   r^   )lib_pathrb   c                 C  s"   || _ d| _t|| _d| _d S )NFT)r  is_openr   r  DLL)r   r  rd   rd   rf   r     s    zDLLWrapper.__init__rh   c                 C  s   | j r|   d| _ d S ri   )r  _dlcloser   rd   rd   rf   rZ    s    zDLLWrapper.closec                 C  s   d }t  r6td }t|ds$td}t|drb|j}n,t rZdd l}|jddd}|j}ntd|d urt  rtg|_	|| j
j qt rdd l}ddlm} |jg|_	|| j
j n
td	 d S )
Ndlclosezlibc.sor   kernel32T)use_last_errorz&Unsupported env, failed to do dlclose!)wintypeszKdll unloading function was not found, library may not be unloaded properly!)r@   r   r3  r  rA   r  ZFreeLibraryr  r   r  r  _handler  ZHMODULEri  rj  )r   Z	f_dlcloseZsymsr  r  r  rd   rd   rf   r    s0    


zDLLWrapper._dlclosezCallable[..., None]r7  c                   s:   | j std| j t| j| ddd fdd}|S )NzCannot use closed DLL library: r   r^   r`   rb   c                    s     |  }|rt d j d S )NzError in function: )r   r   )r`   errmethodrd   rf   _wrapped_func*  s    z-DLLWrapper.__getattr__.<locals>._wrapped_func)r  r   r  r$  r  )r   r   r  rd   r  rf   __getattr__$  s
    zDLLWrapper.__getattr__r   c                 C  s   | S rc   rd   r   rd   rd   rf   	__enter__1  s    zDLLWrapper.__enter__r   r  c                 G  s   |    d S rc   rZ  )r   r`   rd   rd   rf   __exit__4  s    zDLLWrapper.__exit__c                 C  s   |    d S rc   r  r   rd   rd   rf   __del__7  s    zDLLWrapper.__del__N)r   r   r   r,  r   rZ  r  r  r  r  r  rd   rd   rd   rf   r    s   	#r  )rC  rb   c                 C  s   | d S )z,
    standard format for the error path
    z.errorrd   )rC  rd   rd   rf   binary_error_path;  s    r  c                
   @  s   e Zd ZU dZejG dd dZi Zded< g Z	ded< dZ
ed	d
ddZeeddddddddZeddddddZed&dddddddZeddddd d!Zed'dddddd"d	d#d$d%ZdS )(r  aJ  
    A cache for managing the compilation and loading of CUDA source code specifically for CUTLASS.
    This class handles writing source code to files, compiling them into shared objects, and caching
    the results to avoid redundant compilations. It also manages error handling and logging for the
    compilation process.
    c                   @  s*   e Zd ZU ded< ded< dZded< dS )zCUDACodeCache.CacheEntryr   
input_pathrC  Nr
  
error_json)r   r   r   r-  r  rd   rd   rd   rf   
CacheEntryL  s   
r  dict[str, CacheEntry]r   rl  r  Zcur^   rh   c                   C  s   t j  t j  d S rc   )r  r   r  r  rd   rd   rd   rf   rI  V  s    
zCUDACodeCache.cache_clearr   )maxsizerg   r  )caching_enabledcaching_availablerb   c                 C  sR   | st d dS |sdS zddlm} | W S  tyL   t d Y dS 0 dS )ad  
        Get or create the class instance of the CUTLASSKernelBinaryRemoteCache.

        Args:
            caching_enabled: Whether binary remote caching is enabled
            caching_available: Whether we're in fbcode environment

        Returns:
            CUTLASSKernelBinaryRemoteCache: The class instance of the kernel binary remote cache
        z6CUTLASSKernelBinaryRemoteCache not requested, skippingNr   )CUTLASSKernelBinaryRemoteCachezECUTLASSKernelBinaryRemoteCache not available, remote caching disabled)ri  r  Z-torch._inductor.fb.kernel_binary_remote_cacher  r  )r  r  r  rd   rd   rf   get_kernel_binary_remote_cache[  s    
z,CUDACodeCache.get_kernel_binary_remote_cacher   r  r  r  rb   c                 C  sf   t jjr ttdgd|}|}n*t|dkrDt t t t g|g ng }t	|| j
|d\}}||fS )
        Writes source code into a file with dst_file_ext as the file extension.
        Returns the hash key of source code, and the path to the file.
        dummy_inputdummy_outputrP  r  )r    r   Zcutlass_hash_with_compile_cmdr  r  r   r  r  r  r  _SOURCE_CODE_SUFFIX)r  r  r  cuda_commandr   r   r  rd   rd   rf   r  |  s&    zCUDACodeCache.writeNr  r   r  r  r  rb   c                 C  s  |  ||\}}|| jvrBddlm} t }|tj||d td}| |dt	| j
  | }	t|	}
| jtjjotj t d}|dur||	|
 tj|
r6t|
dd}| }W d   n1 s0    Y  t|\}}|durtjjr||
tjj t||	|| j|< t||tj|	st|g|	||}t|d	,}| d
 | d| d
 W d   n1 s0    Y  t  }t!"d| |#d}zJt$ rddl%m&} ||tj'|tj(|	 nt)j*|t)j+tj,d W n t)j-yV } z8| .|j/0d||||	| t||j/|W Y d}~njd}~0  t1y } zHdt2|v r| .t2|||||	| t|t2|||W Y d}~n
d}~0 0 t  }d||  d| }t!3| nt!"d| |durtjjr||	tjj t||	d| j|< W d   n1 s80    Y  | j| }|j4durzt|j4\}}t||5d| j| j6||fS )z
        Compiles CUDA source_code into a file with dst_file_ext extension.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        r   rr  rt  ru  N)r  r  r   r  r  r  z// CUDA Compile cmd
// zCUDA Compilation: %sr  )run_build_command)stderrenvzCOMPILE FAILED WITHzCUDA Compilation took  seconds. Compile command: z8CUDA Compilation skipped: %s since output already exists)7r  r   r  rs  r   r   r   r   r  rt  r  r  r  r    r   Zuse_binary_remote_cacheZforce_disable_cachesr   r   r   r   r  r   r  upload_to_binary_remote_cacher"  binary_remote_cache_force_writer  r  r!   CUDACompileErrorr  r   ri  r  r;  r   r   r  r  r   r  r.  STDOUTr  CalledProcessError_record_cuda_compile_erroroutputr   r  r   r  r  r   rC  )r  r  r  r  r   r  rs  r   r  rC  
error_pathbinary_remote_cachefhr  	cmd_partsZerror_outputr  r'  
start_timer  errorend_timelog_duration_msgcache_entryrd   rd   rf   r    s    &


2





$4
zCUDACodeCache.compiletuple[DLLWrapper, str, str]c                 C  s<   |dkrt d| d| | ||\}}}t|||fS z
        Compiles source code and loads the generated .so file.
        Returns a tuple of DLLWrapper, hash_key, source_code_path
        r  zCOnly support loading a .so file for now. Requested file extension: z. Source code: r   r  r  r  r  r  Zdst_file_pathr   Zsource_code_pathrd   rd   rf   r     s    
zCUDACodeCache.loadr   )	error_strr   r  r  rC  r  rb   c           
      C  s   t ||g}t|||| j|< t|}t|ddd}	|	| W d    n1 sX0    Y  |d urtj	j
r||tj	j d S )Nr  r   r  )r   r   r  r  r   r  r   r  r    r   r  r"  r  )
r  r  r   r  r  rC  r  r  r  r  rd   rd   rf   r  !  s    (z(CUDACodeCache._record_cuda_compile_error)N)N)r   r   r   r,  r4  	dataclassr  r   r-  r  r  r   rI  r	   r  r  r  r  r   r  rd   rd   rd   rf   r  C  s*   
 s
 r  c                   @  s   e Zd ZU ejG dd dZi Zded< g Zded< dZ	dZ
ed	d
ddZeddddddZeddddddddZeddddddZdS )r  c                   @  s   e Zd ZU ded< ded< dS )zROCmCodeCache.CacheEntryr   r  rC  N)r   r   r   r-  rd   rd   rd   rf   r  ?  s   
r  r  r   rl  r  rX  Fr^   rh   c                   C  s   t j  t j  d S rc   )r  r   r  r  rd   rd   rd   rf   rI  I  s    
zROCmCodeCache.cache_clearr   r  r  c                 C  s.   t tdgd|}t|| j|d\}}||fS )r  r  r  r  )r  r&   r  r  )r  r  r  r  r   r  rd   rd   rf   r  N  s    
zROCmCodeCache.writeNr  r   r  c                 C  s  | j s d| _ tttt  | ||\}}|| jvrzddlm	} t
 }|tj||d td}| |dt| j  | }	tj|	s:t|g|	||}
t }|
d}z&tj|tjdtjd}td	| W n8 tjy } zt||j|W Y d}~n
d}~0 0 t }d
||  d|
 }t| ntd||	 t||	| j|< W d   n1 sp0    Y  | j| j ||fS )z
        Compiles source_code into a file with dst_file_ext extension,
        using the compile command specific for the ROCm platform.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        Tr   rr  rt  ru  Nr  )r  r  r  zCompilation output: %szCompilation took r  z+Skip compiling %s: output %s already exists)!_logged_compiler_versionri  r  r1   r   r'   r  r   r  rs  r   r   r   r   r  rt  r  r   r&   r   r;  r  r.  r  r  r  r!   r  r  r  r  r  rC  )r  r  r  r  r   r  rs  r   r  rC  r  r  r  r  r  r  r  rd   rd   rf   r  ]  sH    	

&2zROCmCodeCache.compiler  c                 C  s<   |dkrt d| d| | ||\}}}t|||fS r  r  r  rd   rd   rf   r     s    
zROCmCodeCache.load)N)r   r   r   r4  r  r  r   r-  r  r  r  r   rI  r  r  r  r   rd   rd   rd   rf   r  =  s   
 1r  c                   @  s   e Zd ZddddZdS )CodeCacheFutureCallable[..., Any]rh   c                 C  s   t d S rc   r  r   rd   rd   rf   r    s    zCodeCacheFuture.resultN)r   r   r   r  rd   rd   rd   rf   r    s   r  c                   @  s.   e Zd ZdddddddZddd	d
ZdS )LambdaFutureNr  zOptional[Future[Any]]r^   )	result_fnr  rb   c                 C  s   || _ || _d S rc   )r  r  )r   r  r  rd   rd   rf   r     s    zLambdaFuture.__init__rh   c                 C  s   |   S rc   )r  r   rd   rd   rf   r    s    zLambdaFuture.result)N)r   r   r   r   r  rd   rd   rd   rf   r    s    r  c                   @  s.   e Zd ZdZdddddZdddd	Zd
S )StaticAutotunerFuturezM
    A statically launchable CachingAutotuner, loaded from TritonBundler
    rz   r^   )static_autotunerrb   c                 C  s   || _ d | _d S rc   )r  reload_kernel_from_src)r   r  rd   rd   rf   r     s    zStaticAutotunerFuture.__init__rh   c                 C  sd   | j d usJ td: | jj| j d | jjd| j d d | jW  d    S 1 sV0    Y  d S )Nz%StaticAutotunerFuture.warm_precompile)r  F)Zwarm_cache_onlyZreload_kernelZstatic_triton_bundle_key)r  r   r  Zrecheck_autotune_cacheZ
precompiler   rd   rd   rf   r    s    
zStaticAutotunerFuture.resultN)r   r   r   r,  r   r  rd   rd   rd   rf   r    s   	r  )r   )r   )r   r   )r   r   r   N)FF)F)N(  
__future__r   r   r@  r4  r   r   r  importlib.resourcesr<  r  r   loggingr   r  r  r\  rM  r$  r  r  r  r  r  r  rS  bisectr   r   r  r   r   r   datetimer   r	   r
   pathlibr   r   r   typesr   typingr   r   r   r   r   r   r   r   r   Ztyping_extensionsr   r   r   Ztorch.distributedr  r  r   r   Ztorch._dynamo.excr   Ztorch._dynamo.utilsr   r   r   Ztorch._inductorr    r!   r"   Ztorch._inductor.codegen.commonr#   r$   Ztorch._inductor.codegen.cudar%   Z,torch._inductor.codegen.rocm.compile_commandr&   r'   Z$torch._inductor.compile_worker.utilsr(   Ztorch._inductor.cpp_builderr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   Ztorch._inductor.cpu_vec_isar5   Z!torch._inductor.custom_graph_passr6   r7   r8   Ztorch._inductor.freezing_utilsr9   r:   Z%torch._inductor.runtime.compile_tasksr;   Z%torch._inductor.runtime.runtime_utilsr<   r=   Ztorch._inductor.utilsr>   r?   r@   rA   Ztorch._loggingrB   Ztorch._subclasses.fake_tensorrC   rD   rE   Ztorch._utils_internalrF   Ztorch.compilerr  Ztorch.compiler._cacherG   rH   rI   Z)torch.export.pt2_archive._package_weightsrJ   rK   Z"torch.export.pt2_archive.constantsrL   Z%torch.fx.experimental.symbolic_shapesrM   rN   rO   Ztorch.utils._ordered_setrP   r}   rS   r  rT   ZruntimerU   Zruntime.autotune_cacherV   Ztriton_bundlerrW   ZvirtualizedrX   r   Ztriton.fb.buildrY   Ztorch._inductor.fb.utilsrZ   r[   r\   r]   rj   collections.abcrk   rl   rm   concurrent.futuresrn   r  ro   r  rp   rO  rr   Zirrs   rt   ru   rv   rw   Zruntime.hintsrx   ry   Zruntime.triton_heuristicsrz   r  r{   r]  r#  r  Z_loggingZgetArtifactLoggerr   r  	getLoggerri  r   r   r   r   r   r   r   r   r   r   r   r  r	  r  r  r   r  r(  r0  r7  Picklerr8  r  r  r  r  r  r  rQ  rk  r  r  r  registerr  r   r<  r=  rJ  r  r-  r  r   r   r  r  r  r  r  r  r  r  r	  r
  rE  rD  r[  rp  r   ru  rw  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rd   rd   rd   rf   <module>   s  ,8


	E[	   
 
 4!	 %}   E
I    K3? D 5D  f)g		
	 	
-+K zc