a
    h                    @   sN0  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZmZ d dlmZ d dlZd dlmZ d dlmZmZmZ d dlmZmZmZmZ d d	lmZ d d
lmZm Z m!Z! d dlm"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z: d dl;m<Z= edZ>edZ?ej@jAZAejBCdddZDeEd\ZFZGZHe
e
e?e>f ge
e?e>f f dddZIe'dddZJdd ZKdd ZLeIeAjMeAjNge4 dddejOd d fd!d"ZPeIeAjQjReAjQjSge4 d#d$ ZTeIeAjUjReAjUjSge4 d%d&d'd(ZUdd)d*ZVd+d, ZWdd.d/ZXeIeAjYjRd0d1 ZZeIeAj[e4 d2d3 Z[eIeAj\jReAj\jSeAj]jReAj]jSge4d4d5d6d7 Z^eIeAj_jReAj_jSge4 d8d9 Z_d:d; Z`deeaeb ecd<d=d>ZdeIeAjejReAjejSge4 d?d@ ZfdZgeaeb d&dAdBZheIeAjijReAjijSge4 dCdD ZjeIeAjkjlddEdFdGZmeIeAjkjRejnddddHdIdJZoeIeAjpjReAjpjSge4 ejnddddHdKdLZqeIeAjpjreAjpjsge4 ejnddddHdMdNZteIeAjujReAjujSge4 dddddHdOdPZveIeAjwjReAjwjSge4 eeaeb ebebdQdRdSZxeIeAjyjRddTdUZzdVdW Z{eIeAj|jRdXdY Z}eIeAj~deeeee ee eej dZd[d\ZeIeAjdeeeeej d]d^d_ZeIeAjd`d`ddaeeeeeej dbdcddZeIeAjd ejejee ee eej ecebebebde	dfdgZeIeAjjRd-dheebeejeecedidjdkZeIeAjjRd-dheebeejeecedidldmZe4 eIeAjjRdndo ZeIeAjjRdddd d ddpeeee ee ee ebecedqdrdsZeIeAjjReAjjge4 dtdu ZeIeAjjd!dvdwZeIeAjjReAjjge4 dxdy ZeIeAjjd"dzd{ZeIeAjjRd|d} ZeIeAjjSd~d ZeIeAjjRdd ZeIeAjjdd ZeIeAjjRdd ZeIeAjjRddddddddZeIeAjjRd#ddZeIeAjjRd$ddZeIeAjjRd%ddZeIeAjjRdd ZeIeAjjdd ZeedddZeeedddZd&eeecdddZd'eeedddZeeecedddZd(eeeedddZedddZeIeAjjReAjjge4ddd)eeecdddZeIeAjjReAjjSge4 eedddZeIeAjge4ddedddZeedddZeIeAje4 eeecedddZeIeAje4 d*eeecedddZeIeAje4 d+eecedddZeIeAje4 d,eecedddZeIeAjjRd-eececdÜddńZeIeAjjReAjjSge4 eeedƜddȄZeIeAjjRd.eecdɜdd˄ZeIeAjjReAjjSge4ddd΃d d dϜeececeeeef dМdd҄ZeIeAjjReAjjSge4 d dӜeeeecedԜddքZeIeAjjReAjjSge4ddd؃d-dٜeeceeeef dڜdd܄ZeIeAjjReAjjSge4ddd΃d-d dޜeececeeeef dߜddZeIeAjjReAjjSge4 d-d deeeececedddZeIeAj΃e4ddd؃d/eeececeeeef dddZeeececf dddZeIeAjjReAjjSge4ddd0eeeeef dddZeIeAjjReAjjge4dddd̓eeeeeef dddZeIeAjjRd1eececee dddZeeeeaeb eaeb f dddZeeee eeef dddZeeecd ddZeIeAjڃd-d dddddeeececee ee ee ee eeeeef d	ddZeIeAjjReAjjSgd-d ddeeecececee edd	d
ZeIeAj݃e4ddd-dd2eeecececeeef dddZeIeAjjRdd ZeIeAje4 d3eeeececedddZdd Zdd ZeIeAje4 dd ZeIeAje4 dd Zdd ZeIeAje4d d!d" ZeIeAje4d d#d$ Zd%d& ZeIeAje4 d'd( ZeIeAje4 d)d* ZeIeAjjReAjjeAjjReAjjge4d d+d, Zd-d. ZeIeAje4 d/d0 ZeIeAje4 d1d2 ZeIeAjjReAjjeAjjReAjjge4d d3d4 ZeIeAje4 d4eeed6d7d8Z eIeAje4 eeeeed9d:d;ZeIeAjjReAjjSge4d-dd`d`d<d=d>ZeIeAjjReAjjSge4 ddEd?d@ZeIeAjjd5dBdCZeIeAjj	d6dDdEZ
eIeAjjReAjjSge4 d7dFdGZeIeAjjRd8dHdIZeIeAje4d-ddJdK ZdLdM ZdNddOdPZd9ejejeeaeb ebf eeaeb ebf eeaeb ebf ecebeeeaeb ebf  dQdRdSZdTdU ZeIeAjjRejejeej eej eej eceedVdWdXZeIeAjjRejejejeaeb eaeb eaeb eceaeb ebdY	dZd[ZejjrejBCd\ddZeIej@jjjRd]d^ ZeIej@jjjRd_d` Z ejj!r(ejBCdaddZ"eIej@j#j$dbdc Z%ejBCddddZ&eIej@j'j(jReIej@j'j)jRdedf Z*eIej@j'j(j+dgdh Z,eIej@j'j-jReIej@j'j-j.didj Z/eIej@j'j-j+eIej@j'j-j0dkdl Z1eIej@j'j2jReIej@j'j3jRdmdn Z4ejBCdoddZ5eIej@j6j7d:dsdtZ8eIej@j6j9dudv Z:dwdx Z;eIeAj<jRd;dydzZ=d{d| Z>eIeAj?jRd}d~ Z@eIeAjAe4 d<ddZBeIeAjCe4d dd ZDeIeAjEjRdd ZFeIeAjGjRdd ZHeIeAjIjRdd ZJeIeAjKe4d dd ZLeedddZMeIeAjNe4dd5dd ZOeIeAjPe4d dd ZQeIeAjRe4dd5dd ZSeIeAjTe4d dd ZUeIeAjVjd=ddZWeIeAjXjReAjXjSge4 dd ZYeIeAjZjReAjZjSge4 d%debdddZZeIej@jAj[jRej@jAj[jSge4 dd Z[eIeAj\jeAj]jgdd Z^eIeAj_jRgdd Z`eIeAjajReAjajSge4d-dd`d`d<ddZbeIeAjcjgdd ZdeIeAjejReAjfjRgdddddZgeIeAjhjRgdddddZieIeAjjge4 dd ZkeIeAjlgdd ZmeIeAjngdd ZoeIeAjpgdd ZqeIeAjrgdd ZseIeAjtgdd ZtebebebdddZudd ZveIeAjwgee dddZxeIeAjygdd ZzeIeAj{gdÐdĄ Z|eIeAj}jRdŐdƄ Z~eIeAje4 dǐdȄ ZeIeAjjRd>dɐdʄZeIeAjjRdːd̄ Zd?d͐d΄ZeIeAjjReAjjSge4 d@ddϜdАdфZeIeAjjReAjjRgdҐdӄ ZeIeAjjeAjjeAjjeAjjeAjjReAjjge4d4d5dAdԐdՄZeIeAjjRd֐dׄ ZeIeAjjRdؐdل ZeIeAjjRdڐdۄ ZeIeAjjeAjjeAjjeAjjeAjjReAjjReAjjRgdܐd݄ ZeIeAjjeAjjeAjjeAjjgdBdސd߄ZeIeAjjReAjjgdd Zdd ZeIeAjjeAjjgdd ZeIeAjjeAjjgdd ZeIeAjjRdd ZeIeAjjeAjjgdd ZeIeAjjeAjjgdd ZeIeAjjRdd ZeIeAjje4 dCedddZeIeAjge4 dDddZeIeAjgdEddZeIeAjgdFddZeIeAjjReAjjRgdGddZeIeAjjdd ZeIeAjjRdd ZeIeAjd d ZeIeAje4 dd ZeIeAjdd ZeIeAjjRdHddZeIeAjjRdd	 ZǐdId
dZeIeAjjRdd ZeIeAjjdd Zːdd Z̐dd Z͐dd Zΐdd ZϐdJeebebebebebebebebebebebebebebebebebebebeecdddZАdd ZeeebebebebebebebebebebebebebebebebedddZҐd d! ZeIeAjjRd"d# ZeIeAjjRdKd$d%ZeIeAjjRd&d' ZeIeAjڃe4dd5dLd(d)ZeIeAj܃e4d d*d+ Zeed,d-d.ZG d/d0 d0eZeeebd1d2d3ZeIeAjjRd4d5 ZeIeAje4 d6d7 ZeIeAje4d d8d9d: ZeIeAjjRgd;d< ZeIeAjjRdMd=d>ZeIeAjjReAjjSge4 ddddd d?d@dAZeIeAjjReAjjSge4 ddddd d?dBdCZeIeAjjbdDdE ZeIeAjjRdFdG ZeIeAjjRdNdHdIZdOebebecdJdKdLZdMdN ZdOdP ZeIeAjjRdPdQdRZdQdSdTZdRdUdVZdWdX ZdSdYdZZdTd[d\ZeIeAjjRd]d^ ZeIeAjd_d` ZeIeAjj eAjjeAjjeAjjge4 dUdadbZeIeAjj eAjjeAjjeAjjgdVdcddZeIeAjgdWeeeeececee dfdgdhZeeebdif djdkdlZeIeAj	gdXeeeee eceececee dm	dndoZ
eIeAjgdYeeeee eececee dpdqdrZeIeAjgdZeeeeeeeeebebeeceeee dsdtduZeIeAjgd[eeeeecee ee dvdwdxZeIeAjgd\eeeeeeeecee ee dy
dzd{ZeIeAjgd]eeeee ececee d|d}d~ZeIeAjgd^eeeeee eeeeeeaec ecee dddZeIeAjgd_eeeeeeeeeeeebebeecee dddZeIeAjgd`eeeee ee ebebeececee eeb eeb ee ee dddZeIeAjgdaeeeeeeeeebebeeceeee eeb eeb dddZeIeAjgdbeeeee ee ee eeb eeb eebecee ee ee eeb dddZeIeAjgdceeeeee ee ee ej ej eeeeebecee eeb ecdddZ!eIeAj"jRgddejejejejeej eej eej ecdddZ#eIeAj$j%eAj$j&ge4 deddZ'eIeAj(j%dfddZ)eIeAj*jReAj*jSge4 dgddEddZ+dd Z,dd Z-eIeAj.jReAj/jRgdhddZ.eIeAj0jReAj1jRgdiddZ0eIeAj2jReAj3jRgdjeeeebej f  eeebej f  ee ee dddZ2eIeAj4jReAj5jRgdkddZ4eIeAj6jReAj6j7eAj6jeAj6j8gdlddZ9dd Z:eIeAj;jRdmddZ<eIeAj=jRdd Z=eIeAj>jRdd Z>dd Z?dd Z@eIeAjAjReAjBjRgdnddZCeIeAjDjRdoddZDeIeAjEjRdpddZFeIeAjGe4 dqddZHeIeAjIjReAjIjge4d4d5drddZJejKZLdd ZMeIeAjNjRdd ZNeIeAjOjRdÐdĄ ZOeIeAjPjRdŐdƄ ZQeIeAjRjRdǐdȄ ZReIeAjSjeAjSjTge4 d d dɜdʐd˄ZUeIeAjVge4 dsd͐d΄ZWeIeAjXjReAjYjRgdtdϐdЄZZeIeAj[jRgdudѐd҄Z\eIeAj]jRdӐdԄ Z]eIeAj^jReAj^jSge4 dvdՐdքZ^eIej@jAj_dאd؄ Z_eIej@jAj`dِdڄ Z`eIeAjae4 d d dddۜdܐd݄Zbdސd߄ ZceIeAjddd ZeeIeAjfdwddZgeIeAjhdxddZieIeAjjdyddZkeIeAjle4 d d dddZmeIeAjne4 ebeedddZoeIeAjpedddZqeIeAjre4d-deedddZreIeAjse4 eedddZsdd Ztdzeeeej eej ee ee eej eej ecd	ddZueIeAjve4 d{eeee ee eej edddZweIeAjxjRgd|ejejejejeej eej eej eej ecd	ddZyeIeAjze4 eebeced ddZ{eIeAj|e4 d}ddZ}eIeAj~e4 d~eeebececedddZ~eIeAjjRdeeae eaeb edd	d
Zdd Zdd ZeeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj eeAj dd ZeIeAje4 dd ZeIeAje4 d`dddZeIeAje4 d`dddZeeAjZeeAjZeeAjZd dl5Zd dlZd dlZdd Ze  dS (      N)Sequence)Enum)reducewraps)CallableOptionalTypeVarUnion)	ParamSpec)SymBoolSymFloatTensor)_add_op_to_registry_convert_out_paramsglobal_decomposition_table
meta_table)
OpOverload)_prim_elementwise_meta$ELEMENTWISE_PRIM_TYPE_PROMOTION_KINDview_of)BoolLikecorresponding_complex_dtypecorresponding_real_dtypedefinitely_contiguouselementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KIND	FloatLikeIntLikeis_contiguousmake_contiguous_strides_forNumbersuggest_memory_format
TensorLike)_maybe_convert_to_dtype_maybe_resize_out_resize_output_check_safe_copy_outout_wrapper)_broadcast_shapes_maybe_broadcast)_config)_pytree_T_PatenZIMPLMeta   )returnc                    s    fdd}|S )Nc                    s$   t    fdd}t|  S )Nc                    s   t t|   d S N)r   r   opfn G/var/www/auris/lib/python3.9/site-packages/torch/_meta_registrations.pyregisterA   s    z0register_meta.<locals>.wrapper.<locals>.register)r   pytreeZ	tree_map_)r6   r9   r3   r5   r8   wrapper>   s    zregister_meta.<locals>.wrapperr7   )r4   r;   r7   r3   r8   register_meta=   s    	r<   type_promotionc                    s>   t j|d| i\}  fdd|D }t| }t|dtjiS )Ntype_promotion_kindc                    s   g | ]}t | qS r7   )r#   .0xresult_dtyper7   r8   
<listcomp>S       z$elementwise_meta.<locals>.<listcomp>r>   )utilsr   r)   r   r   DEFAULT)r>   args_r7   rC   r8   elementwise_metaJ   s    
rK   c                 C   s(   t jt jt jt jt jt ji}|| | S r2   )torchZ	complex32halfcfloatfloatcdoubledoubleget)dtypeZfrom_complexr7   r7   r8   toRealValueType^   s
    rT   c                    s2   t tg|R   t k fdd d S )Nc                      s   d d  S )Nzoutput with shape z# doesn't match the broadcast shape r7   r7   Zbroadcasted_shape
self_shaper7   r8   <lambda>k   rF   z)check_inplace_broadcast.<locals>.<lambda>)tupler(   rL   _check)rV   Z
args_shaper7   rU   r8   check_inplace_broadcastg   s
    rZ   Fc	           	         s  t tjr$t dkdd  t tjrHt dkdd  tdd fD rtt  d u r| qtt	 fdd npt t tj
sJ tt tfdd t tsJ tdkd	d  tjf|d
||dS )Nr   c                   S   s   dS Nz:linspace only supports 0-dimensional start and end tensorsr7   r7   r7   r7   r8   rW      rF   z(meta_linspace_logspace.<locals>.<lambda>c                   S   s   dS r[   r7   r7   r7   r7   r8   rW      rF   c                 s   s   | ]}t |tV  qd S r2   )
isinstancecomplex)rA   argr7   r7   r8   	<genexpr>   rF   z)meta_linspace_logspace.<locals>.<genexpr>c                      s   d  d S )Nzlinspace(): inferred dtype z& can't be safely cast to passed dtype r7   r7   )default_complex_dtyperS   r7   r8   rW      rF   c                      s*   dt j dt  j dt j dS )Nz4received an invalid combination of arguments - got (, ))type__name__r7   )endstartstepsr7   r8   rW      s   c                   S   s   dS )Nz$number of steps must be non-negativer7   r7   r7   r7   r8   rW      rF   metarS   layoutdevice
pin_memoryrequires_grad)r\   rL   r   rY   dimanyrG   r   get_default_dtypeis_complex_dtyperS   _check_typer   empty)	rf   re   rg   baserS   rk   rj   rl   rm   r7   )r`   rS   re   rf   rg   r8   meta_linspace_logspaceo   sH    

ru   c                    sN   t  jt jk fdd t |  dko6  dk dd  |  jS )Nc                      s   d j  S )Nz2take(): Expected a long tensor for index, but got rS   r7   indexr7   r8   rW      rF   zmeta_take.<locals>.<lambda>r   c                   S   s   dS )Nz*take(): tried to take from an empty tensorr7   r7   r7   r7   r8   rW      rF   )rL   rY   rS   long_check_indexnumel	new_emptyshape)selfrx   r7   rw   r8   	meta_take   s    

r   rn   c                   sh   j }j }t||kdd  t dko> dk fdd tjj}|S )Nc                   S   s   dS )Nz=linalg.cross: inputs must have the same number of dimensions.r7   r7   r7   r7   r8   rW      rF   zlinalg_cross.<locals>.<lambda>r0   c                      s"   d  d   d   S )Nzlinalg.cross: inputs dimension z must have length 3. Got  and sizer7   rn   otherr~   r7   r8   rW      s
    )ndimrL   rY   r   r(   r}   r|   )r~   r   rn   Zx_dZy_d	out_shaper7   r   r8   linalg_cross   s    r   c                    s  ddl m mm}  fdd}fdd}t| dkrJdgt| S ttj| d}||dk}|rz||| |rz|S dgt| }	|rtt|d ddD ]@}
|
t|d krd|	|
< qt	||
d  d|	|
d   |	|
< q|	S t|d }
|d }d}d}tt| d ddD ]}|| | 9 }|dks`|| |d  dkr|||d  || kr|
dkr|||k s|||
 dkr|| |	|
< |||
 9 }|
d8 }
q`|||kr d S |dkr||d  }d}d}q|
dkrd S |	S )	Nr   )guard_or_falseguard_or_truesym_eqc                    s   r | S | S r2   r7   rB   )r   size_obliviousr7   r8   maybe_guard_or_false   s    z-_compute_stride.<locals>.maybe_guard_or_falsec                    s   r | S | S r2   r7   r   )r   r   r7   r8   maybe_guard_or_true   s    z,_compute_stride.<locals>.maybe_guard_or_true   r   )
%torch.fx.experimental.symbolic_shapesr   r   r   lenr   operatormulrangemax)Z	old_shapeZ
old_stride	new_shaper   r   r   r   r{   Z
zero_numel
new_strideZview_dZchunk_base_strideZtensor_numelZ
view_numelZtensor_dr7   )r   r   r   r8   _compute_stride   s^    





r   c                    sV   ddl m  t fdd|  D pTt fdd|  D pTt fdd|D S )Nr   has_hintc                 3   s   | ]} | V  qd S r2   r7   rA   sr   r7   r8   r_     rF   z+_view_has_unbacked_input.<locals>.<genexpr>c                 3   s   | ]} | V  qd S r2   r7   r   r   r7   r8   r_     rF   c                 3   s   | ]} | V  qd S r2   r7   r   r   r7   r8   r_     rF   )r   r   ro   r   stridear}   r7   r   r8   _view_has_unbacked_input  s    r   Tc                    s  ddl m}m} tjddt   jdkrv }D ] }t	|dk tj
|d}q@| u rrt S |S tdkr } jD ] }t	|dk tj
|d}q| u rt S |S ttjd}t	  |k fdd tt jkr|| jrt S |r.t rNn
t rNt} |S t    |d	}	|	d ur| |	S |rtjjjjst rt dd
S d j d   d d}
t |
d S )Nr   )r   r   F)validater   r   c                      s   d j  d dS )Nz&Could not reshape a tensor with shape  as a tensor with shape !r}   r7   r   r7   r8   rW   E  rF   z%_view_unbacked_meta.<locals>.<lambda>)r   )size_oblivious_enabledz Cannot view a tensor with shape z and strides r   r   )!r   r   r   rG   Zextract_shape_from_varargsZ
infer_sizer{   r   rL   rY   _refs	unsqueezer   r   r}   Zsqueezer   r   r   r   r   r   
as_stridedr   r   r   fxexperimentalr*   backed_size_obliviousr   _view_unbacked_meta
ValueError)r   r}   r   r   r   Z_alengthZshape_numelstridesZnew_stridesmsgr7   r   r8   r   !  sT    


&


r   c                 G   s>   t jjjjst| |r t| |S t jj| g|R ddiS d S )NZ
allow_copyF)	rL   r   r   r*   r   r   r   r   Z_reshape_view_helperr   r7   r7   r8   
_view_metae  s
    
r   c                 C   s$   t | d t| d tj| tjdS )Nzlinalg.matrix_expmemory_format)squareCheckInputscheckFloatingOrComplexrL   
empty_likecontiguous_formatr~   r7   r7   r8   linalg_matrix_expo  s    

r   valuesindicesc                 C   sV   t j| j| j| jd}t j| j| jt jd}|  dkrN| jdkrNt|| j ||fS )Nrk   rS   r   )	rL   rs   r}   rk   rS   int64r{   r   maybe_wrap_dim)r~   rn   r   r   r7   r7   r8   	cummaxminw  s
    r   c                 C   s   t || j tj| tjdS Nr   )r   r   rL   r   r   )r~   rn   r7   r7   r8   logcumsumexp  s    r   c                   s  |j }t|}|| }tt|}dd t|D }	|D ]}
d|	|
< q8g g  }}|D ]"}
|	|
 sl||
 qT||
 qT|| }t|}|  |d | }|j fdddd |||d   }||}dgt|j|d   }|	|}|
d}||d< t|}tt|D ]}|||  ||d	 < q| j|tjd
 dd t|D }d	}|d	 }|dkr|| d ||| < ||||  9 }|d	8 }q^t||D ] }| d	||  ||| < q| |||   | S )Nc                 S   s   g | ]}d qS )Fr7   rA   rJ   r7   r7   r8   rE     rF   z_exec_fft.<locals>.<listcomp>Tc                    s    |  S r2   r7   r   Zself_stridesr7   r8   rW     rF   z_exec_fft.<locals>.<lambda>keyreverser   r   r   r   c                 S   s   g | ]}d qS r   r7   r   r7   r7   r8   rE     rF   )r   r   listr   appendr   sortpermuter}   Zreshaper   resize_rL   r   as_strided_storage_offset)outr~   	out_sizesrn   forwardr   Zsignal_ndim
batch_dimsZdim_permuteZis_transformed_dimdleftrightZ	batch_endtmpinputZbatched_sizes
batch_sizeZbatched_out_sizesiZout_stridesZbatch_numelr7   r   r8   	_exec_fft  sL    





r   )r~   rn   exclude_lastc                    s<   t |}|   |d t|t|  j fddd |S )Nc                    s    |  S r2   r7   r   r   r7   r8   rW     rF   z_sort_dims.<locals>.<lambda>)r   )r   r   r   intr   )r~   rn   r   sorted_dimsr7   r   r8   
_sort_dims  s    
r   c                 C   sH   t | jj |s|  S t| |}| |  }t|| |  ||dS )Nr   )	rL   rY   rS   
is_complexcloner   r|   r   r   )r~   rn   normalizationr   r   r   r7   r7   r8   meta_fft_c2c  s    
r   c                 C   s<   t | tks0t | dkr4| d dkr4| d dkr4dS dS d S )N   r   r   FT)r   cufft_max_ndimr   r7   r7   r8   use_optimized_cufft_path  s    0r   c                    s  t | jj t|  }t|}|d }|| d d }t|}|||< |rV|||< t| dkspt| dkr| j|t	| jd}	| }
t| dkrt
|rt|	|
||dd nt|dkr|n|}t|	|
||gdd t|dkr| j|t	| jd}
|d d }|r|
|	 }	}
|
  |j fd	d
dd ttt|}|t|| d  }t|	|
||dd |d t||  }q|s|	||| kr|
j|t jd |
}	|	S | j|t	| jdS d S )Nr   r   r   cudaZxpurv   Tr   c                    s    |  S r2   r7   r   r   r7   r8   rW     rF   zmeta_fft_r2c.<locals>.<lambda>r   r   )rL   rY   rS   is_floating_pointr   r   device_hintr|   rG   r   r   r   r   r   r   minr   r   r   )r~   rn   r   Zonesidedinput_sizesr   Zlast_dimZlast_dim_halfsizeZonesided_sizesoutputZworking_tensorZtarget_sizesr   Zmax_dimsZ	last_dimsr7   r   r8   meta_fft_r2c  sV    

r   )	generatorc                C   s   t |t| gS r2   )r$   rL   Size)nr   r   r7   r7   r8   meta_randperm%  s    r   rS   rj   rk   rl   c                C   s   t j| ||||dS Nr   rL   rs   )r   rS   rj   rk   rl   r7   r7   r8   meta_randperm_default*  s    	
r   c                   s2   dt  k fdd t j|||||dS )Nr   c                      s   d d  S Nz:random_ expects 'from' to be less than 'to', but got from=z >= to=r7   r7   highlowr7   r8   rW   F  rF   zmeta_randint.<locals>.<lambda>r   rL   rY   rs   )r   r   rS   rj   rk   rl   r7   r   r8   meta_randint8  s    
r   c                   s.   t  k fdd t j|||||dS )Nc                      s   d d  S r   r7   r7   r   r7   r8   rW   [  rF   z"meta_randint_low.<locals>.<lambda>r   r   )r   r   r   rS   rj   rk   rl   r7   r   r8   meta_randint_lowM  s    
r   c                C   s   t j| ||||dS r   r   )r   rS   rj   rk   rl   r7   r7   r8   meta_rand_defaultb  s    
r  )r~   rn   r   lastdimc           
      C   s0  t | jj t| dkrt|  }|||d < | j|t| jd}t	|rjt
|| jt jd||ddS t|dkrt| |d d d|}n| jt jd}t
||||d gddS nv| }t|dkr|d d }t| ||dd}|dd  }t| }|||d < | j|t| jd}	t
|	|||ddS d S )	Nr   r   rv   r   Fr   r   r   )rL   rY   rS   r   r   r   r   r|   rT   r   r   r   r   r   r   )
r~   rn   r   r  r   r   tempr   Zc2c_dimsr   r7   r7   r8   meta_fft_c2rj  s4    	r  c                 C   sf   ddl m} || s*t| dkr*tdt|trb|| |}|  | krbt	j
||   | S )Nr   )free_unbacked_symbolsr   zQmore than one element of the written-to tensor refers to a single memory location)r   r  rL   Z_debug_has_internal_overlapRuntimeErrorr\   r   tor   r.   Zexpand_copydefault)r~   srcZnon_blockingr  Zintermediater7   r7   r8   
meta_copy_  s    
r
  c                 C   sX   t |  }t |  }||  kr(dn|| ||  }||d ||| ||fS Nr   )r   r   r   rn   insert)tensorrn   Zresult_sizesZresult_stridesr   r7   r7   r8   inferUnsqueezeGeometry  s     r  c                 C   s0   t ||  d }t| |\}}| || | S r  )r   rn   r  r   )r~   rn   Zg_sizesZ	g_stridesr7   r7   r8   meta_unsqueeze_  s    r  )r   weight_metabias_activation_opt	out_dtypec           	      C   s   t | j}|d ur.|d|dks.J d|d| dd ksJJ |d|d< t| jdksnJ dd| df}|d ur| jtjkr|tjksJ d| j||d u r| jn|d	||}|S )	Nr   zoutput size mismatchr   r   r   z*we can only handle the squashed input case9out_dtype is only supported for i8i8->i32 linear operatorrv   )
r   r}   r   r   rS   rL   int8int32r|   r   )	r   r  r  r  r  r  output_sizesZtransposed_stridesr   r7   r7   r8   meta_sparse_structured_linear  s$    	
r  )mat1	mat1_metamat2r  c                 C   s   t | jdksJ t |jdks$J t |jdks6J | d|dd ksRJ | d|dg}|d ur|jtjkr|tjksJ d|j||d u r|jn|d}|S )Nr   r   r   r  rv   r   r}   r   rS   rL   r  r  r|   )r  r  r  r  r  r   r7   r7   r8   meta_sparse_structured_mm  s    r  r   )alphabetar  )r   r  r  r  r  c          	      C   s   t | jdksJ dt |jdks(J t |jdks:J t |jdksLJ | d|dkshJ d|d|dd ksJ |d|dg}|d ur|jtjkr|tjksJ d|j||d u r|jn|d}|S )Nr   zEonly input broadcasted to columns of mat1 * mat2 product is supportedr   r   r  rv   r  )	r   r  r  r  r  r   r  r  r   r7   r7   r8   meta_sparse_structured_addmm  s(    r!  )	compressed_Adense_Br  r  r  transpose_resultalg_idsplit_ksplit_k_modec	                 C   s  |j tjtjtjtjtjhv s&J d| j |j ks:J dt|jdksPJ d| j tjtjfv }	|	rjdnd}
|	r|	 rJ d|
d}|
d	}|  d
 |
|  }|d ur||
dksJ |d ur|	r|tjtjtjtjhv sJ d|r||fn||f}|j||dS )Nz;_cslt_sparse_mm only supports fp16, bf16, int8, and fp8e4m3zinputs must have the same dtyper   z'_cslt_sparse_mm only supports 2d inputs
   	   z.dense input must be transposed for 8bit dtypesr   r      z\out_dtype is not supported for {compressed_A.dtype} x {dense_B.dtype} -> {out_dtype} matmul!rv   )rS   rL   float32float16bfloat16r  float8_e4m3fnr   r}   r   r   r{   r  r|   )r"  r#  r  r  r  r$  r%  r&  r'  Zis_8bit_input_typeZcompression_factorkr   moutput_shaper7   r7   r8   meta__cslt_sparse_mm  sB    


r2  )include_self)r~   rn   rx   sourcer   r3  r1   c                C   s   t j| t jdS r   rL   r   r   r~   rn   rx   r4  r   r3  r7   r7   r8   meta_index_reduceL  s    
r7  c                C   s   | S r2   r7   r6  r7   r7   r8   meta_index_reduce_Y  s    
r8  c                 C   s.   t |  }|  dkr$| ||< | |S Nr   )r   r   rn   r{   r|   )r~   rn   rx   result_sizer7   r7   r8   meta_index_selectg  s    r;  )lengthsr   offsetsaxisunsafeinitial)datar   r<  r   r=  r>  r?  r1   c          
         sj   |d urt d fdd}|d ur0||jS |d ur^|jd d |jd d f }	||	S tdd S )Nz?segment_reduce(): indices based reduction is not supported yet.c                    s(   t j| j d d   jdt jdS )Nr   rh   rS   rk   r   )rL   rs   r}   rS   r   )lengths_shaper>  rA  r7   r8   segment_reduce_lengths_tensor  s    z:meta_segment_reduce.<locals>.segment_reduce_lengths_tensorr   r   z<segment_reduce(): Either lengths or offsets must be defined.)NotImplementedErrorr}   r  )
rA  r   r<  r   r=  r>  r?  r@  rE  rC  r7   rD  r8   meta_segment_reducep  s    
rG  c                 C   s
   |  dS Nr7   r|   r   r7   r7   r8   meta_max  s    rJ  c                 C   s6   t | j|f}t| ||}| || j|tjdfS Nrv   rG   reduction_dimsr}   _compute_reduction_shaper|   rL   ry   r~   rn   keepdimr1  r7   r7   r8   meta_max_dim  s
    rQ  c                 C   s
   |  dS rH  rI  r   r7   r7   r8   meta_min  s    rR  c                 C   s6   t | j|f}t| ||}| || j|tjdfS rK  rL  rO  r7   r7   r8   meta_min_dim  s
    rS  c                 C   s4   |   rt| j}nt| tjd\}}tj| |dS Nr?   rv   )r   r   rS   r   r   INT_TO_FLOATrL   r   )r~   rD   rJ   r7   r7   r8   
meta_angle  s    
rW  c                 C   s$   t ||  | j |t | S r2   )rL   Z_resize_output_r   rk   copy_angle)r~   r   r7   r7   r8   meta_angle_out  s    rZ  c                 C   s   d S r2   r7   )valr7   r7   r8   assert_async  s    r\  c                 C   s   d S r2   r7   )r[  
assert_msgr7   r7   r8   assert_async_meta  s    r^  c                 C   s   d S r2   r7   )r   r7   r7   r8   
print_meta  s    r_  rS   rj   rk   rl   r   c                 C   s   t jdddS )Nr   rh   rk   r   r`  r7   r7   r8   make_dep_token  s    	rb  c                 C   s4   ddl m} t| ttfr"td|| ||d d S )Nr   )constrain_range'Constraining SymFloat or Symbool is nyir   r   )r   rc  r\   r   r   r   )r   r   r   rc  r7   r7   r8   sym_constrain_range  s    rf  c                 C   s   t j| ||d |S Nre  )r.   rf  r   r   r   	dep_tokenr7   r7   r8   functional_sym_constrain_range  s    rj  c                 C   s   ddl m} |d u r*|d u r*t|  d S t| ttfr@tdt| t	u r||d urbt
| |k |d urxt
| |k d S || ||d d S )Nr   )_constrain_range_for_sizerd  re  )r   rk  rL   _check_is_sizer\   r   r   r   rc   r   rY   )r   r   r   rk  r7   r7   r8   sym_constrain_range_for_size  s    
rm  c                 C   s   t j| ||d |S rg  )r.   rm  rh  r7   r7   r8   'functional_sym_constrain_range_for_size  s    rn  c                 C   s   |S r2   r7   )r[  r]  ri  r7   r7   r8   functional_assert_async_meta  s    ro  r~   f_namec                 C   sX   |   dksJ | d| d| dksTJ | d| d d| d dd S )Nr   z3: The input tensor must have at least 2 dimensions.r   z5: A must be batches of square matrices, but they are  by 	 matrices)rn   r   rp  r7   r7   r8   r     s     r   r~   Anamec                    s   t j jk fdd t j jk fdd t  d dk fdd t  ddk fdd d S )Nc                      s   dj  d j  dS )Nz:Expected b and A to be on the same device, but found b on z
 and A on 	 instead.ra  r7   rv  r~   r7   r8   rW      s
    z(linearSolveCheckInputs.<locals>.<lambda>c                      s   dj  d j  dS )Nz=Expected b and A to have the same dtype, but found b of type z and A of type rx  rv   r7   ry  r7   r8   rW   (  s
    r   rr  c                      s   d  d d  d dS )Nz3A must be batches of square matrices, but they are rr  rs  r   rt  r   r7   rv  r7   r8   rW   0  s
    c                      s:   d d  d d  d d d d d 
S )NzIncompatible matrix sizes for z: each A matrix is r   rs  z but each b matrix is rr  r   r7   rv  rw  r~   r7   r8   rW   8  s    )rL   rY   rk   rS   r   ru  r7   r{  r8   linearSolveCheckInputs  s     


r|  trq  allow_low_precision_dtypesc                    sZ   | j  t|  p|   fdd |sVt tjtjtjtjfv  fdd d S )Nc                      s    d  S )Nz<: Expected a floating point or complex tensor as input. Got r7   r7   rS   rq  r7   r8   rW   I  rF   z(checkFloatingOrComplex.<locals>.<lambda>c                      s    d  S )Nz*: Low precision dtypes not supported. Got r7   r7   r  r7   r8   rW   N  rF   )	rS   rL   rY   r   r   rO   rQ   rN   rP   r}  r7   r  r8   r   A  s    r   rv  rv  rq  arg_namec                    s"   t |  dk fdd d S )Nr   c                      s    d  dS )Nz: The input tensor z! must have at least 2 dimensions.r7   r7   r  rq  r7   r8   rW   V  rF   zcheckIsMatrix.<locals>.<lambda>)rL   rY   rn   r  r7   r  r8   checkIsMatrixS  s    
r  rv  Br   rq  c                    sZ   t   t tr0 ddkn ddk fdd d S )Nrr  r   c                      sH    drdnd d  d d  d d d d d d	S )
Nz2: Incompatible shapes of A and B for the equation zAX = BzXA = Bz (rr  rB   r   r   rb   r   r7   rv  r  rq  r   r7   r8   rW   _  s    
z#checkInputsSolver.<locals>.<lambda>)r   r  rL   rY   r   r  r7   r  r8   checkInputsSolverZ  s    

*r  resultfn_namer  r   result_namec                    s&   t jjk fdd d S )Nc                	      s$     d d dj  dj  	S )Nz: Expected z5 and input tensors to be on the same device, but got z on z and input on ra  r7   r  r   r  r  r7   r8   rW   o  s    z!checkSameDevice.<locals>.<lambda>)rL   rY   rk   r  r7   r  r8   checkSameDeviceg  s    
r  UPLOc                    s8      }tt dko&|dkp&|dk fdd d S )Nr   ULc                      s
   d  S )Nz1Expected UPLO argument to be 'L' or 'U', but got r7   r7   r  r7   r8   rW   z  rF   zcheckUplo.<locals>.<lambda>)upperrL   rY   r   )r  ZUPLO_uppercaser7   r  r8   	checkUplov  s
    
r  eigenvaluesZeigenvectorsr  )rv  r  	compute_vc                 C   sp   t | d t| t| j}|r@| |}||t|dd n| dg}|  | j|t| j	d}||fS )Nzlinalg.eighFZ	row_majorr   rv   )
r   r  r   r}   r|   r   r   poprT   rS   )rv  r  r  r}   Zvecsvalsr7   r7   r8   meta__linalg_eigh~  s    


r  )r   r1   c                 C   s@   t | d t| jr| jn
t| j}| j| jd d |dS )Nzlinalg.eigvalsr   rv   r   rG   rq   rS   r   r|   r}   )r   complex_dtyper7   r7   r8   meta__linalg_eigvals  s    


r  r   c                 C   sX   t | d t| jr| jn
t| j}| j| jd d |d}| j| j|d}||fS )Nz
linalg.eigr   rv   r  )r   r  r   Zvectorsr7   r7   r8   meta_linalg_eig  s    


r  )r	  r1   c                 C   s   | j jtjdddS )Nr   rr  r   )ZmTr   rL   r   	transpose)r	  r7   r7   r8   cloneBatchedColumnMajor  s    r  )r~   rv  r  r1   c                 C   s   t | S r2   )r  )r~   rv  r  r7   r7   r8   _cholesky_solve_helper  s    r  c                    sP   t jdkfdd t  jdk fdd t d\}}t|||S )Nr   c                      s   d j  dS )Nz-b should have at least 2 dimensions, but has  dimensions insteadr   r7   r   r7   r8   rW     rF   z cholesky_solve.<locals>.<lambda>c                      s   d j  dS )Nz-u should have at least 2 dimensions, but has r  r  r7   rz  r7   r8   rW     rF   cholesky_solve)rL   rY   r   !_linalg_broadcast_batch_dims_namer  )r~   rv  r  Zself_broadcastedZA_broadcastedr7   ry  r8   r    s    

r  )r~   r  r1   c                 C   s.   |   dkrtj| tjdS t| d t| S )Nr   r   cholesky)r{   rL   r   legacy_contiguous_formatr   r  r~   r  r7   r7   r8   r    s    
r  c                 C   s   t | d t| S )Ncholesky_inverse)r   r  r  r7   r7   r8   r    s    
r  )rv  r  check_errorsc                 C   sf   t | d t| d | j}t|}t|d}| |}||| | j|d|d  tjd}||fS )Nzlinalg.choleskyFr   r   rv   )	r   r   r}   r   r   r|   r   rL   r  )rv  r  r  ZA_shaper   Z	L_stridesr  infosr7   r7   r8   linalg_cholesky_ex  s    



r  )r   taur1   c                    s  t jdkdd  t ddkdd  t ddkdd  t jj dkfd	d jdkrjd d }jd d  t  |k fd
d t jjkfdd tdd t jjtjddjj	dS )Nr   c                   S   s   dS )NzHtorch.linalg.householder_product: input must have at least 2 dimensions.r7   r7   r7   r7   r8   rW     rF   z,linalg_householder_product.<locals>.<lambda>rr  r   c                   S   s   dS )Nzbtorch.linalg.householder_product: input.shape[-2] must be greater than or equal to input.shape[-1]r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )Nz`torch.linalg.householder_product: input.shape[-1] must be greater than or equal to tau.shape[-1]r7   r7   r7   r7   r8   rW     rF   r   c                      s   dj  d j  S )Nzptorch.linalg.householder_product: Expected tau to have one dimension less than input, but got tau.ndim equal to  and input.ndim is equal to r  r7   r   r  r7   r8   rW     s
    c                      s
   d  S )Nzltorch.linalg.householder_product: Expected batch dimensions of tau to be equal to input.shape[:-2], but got r7   r7   actual_batch_tau_shaper7   r8   rW     s    c                      s   dj  d j  S )Nz,torch.linalg.householder_product: tau dtype z does not match input dtype rv   r7   r  r7   r8   rW     s    
z torch.linalg.householder_productr  Fr  r   r   rS   rk   )
rL   rY   r   r   r}   rS   r  empty_stridedr   rk   )r   r  Zexpected_batch_tau_shaper7   )r  r   r  r8   linalg_householder_product  sD    


r  )rv  r  c                 C   s^   t | d t| ddd | | j}|| jt| jdd | j| jd d tjd}||fS )Nzlinalg.inv_exF)r  r  rr  rv   r   r   r|   r}   r   r   rL   r  )rv  r  r  r  r7   r7   r8   linalg_inv_ex_meta  s    
r  LDpivotsinfo)	hermitianr  )r~   r  r  r1   c                C   st   t | d t| d tj| jt| jdd| j| jd}| j| jd d tj	d}| j| jd d tj	d}|||fS )Nztorch.linalg.ldl_factor_exFr  r  r   rv   rr  )
r   r   rL   r  r}   r   rS   rk   r|   r   )r~   r  r  r  r  r  r7   r7   r8   linalg_ldl_factor_ex_meta+  s    

r  )r  )r  r  r  r  r1   c                   s   t d td t d t jdk fdd jd d }t|jkfdd ttj	fdd tj	 j	k fdd t
 \}}tj|t|d	d
 j	 jdS )Nztorch.linalg.ldl_solver   c                      s   d j  dS )NzMtorch.linalg.ldl_solve: Expected B to have at least 2 dimensions, but it has r  r  r7   )r  r7   r8   rW   N  s    z'linalg_ldl_solve_meta.<locals>.<lambda>r   c                      s   d j  dS )Nzjtorch.linalg.ldl_solve: Expected LD.shape[:-1] and pivots.shape to be the same, but got pivots with shape  insteadr   r7   r  r7   r8   rW   V  s    c                      s   d j  S )Nz<torch.linalg.ldl_solve: Expected pivots to be integers. Got rv   r7   r  r7   r8   rW   ]  rF   c                      s   dj  d j  S )Nz!torch.linalg.ldl_solve: LD dtype z does not match b dtype rv   r7   )r  r  r7   r8   rW   a  rF   Fr  r  )r   r   r|  rL   rY   r   r}   rG   is_integer_dtyperS   _linalg_broadcast_batch_dimsr  r   rk   )r  r  r  r  Zexpected_pivots_shapeB_broadcast_sizerJ   r7   )r  r  r  r8   linalg_ldl_solve_meta@  s6    	







r  Pr  )pivot)rv  r  r1   c          	         s   t  jdk fdd t j}|d }|d }t||}||d< |rV |}n dg}||d<  |}||d< ||d<  |}|||fS )Nr   c                      s   d j  dS )Nz@linalg.lu: Expected tensor with 2 or more dimensions. Got size: r  r   r7   rz  r7   r8   rW   q  rF   z linalg_lu_meta.<locals>.<lambda>rr  r   r   )rL   rY   r   r   r}   r   r|   )	rv  r  sizesr0  r   r/  r  r  r  r7   rz  r8   linalg_lu_metal  s$    




r  LU)r  r  )rv  r  r  r1   c          	         s   t  jdk fdd t j}|d }|d }t j|t|dd j jd}|	  t
|||d<  j|t jd	}|	   j|t jd	}|||fS )
Nr   c                      s   d j  dS )NzFtorch.lu_factor: Expected tensor with 2 or more dimensions. Got size: r  r   r7   rz  r7   r8   rW     rF   z*linalg_lu_factor_ex_meta.<locals>.<lambda>rr  r   Fr  r  rv   )rL   rY   r   r   r}   r  r   rS   rk   r  r   r|   r   )	rv  r  r  r  r0  r   r  r  r  r7   rz  r8   linalg_lu_factor_ex_meta  s&    


r  )r   adjoint)r  r  r  r   r  r1   c                   s   t d tj jk fdd tjtjkdd  td t |d tddkdd  tjd d jkfdd t	 \}}tj
|t|| d	 j jd
}| dkr|s| r| }|S )Nztorch.linalg.lu_solvec                      s   dj  d j  dS )NzPlinalg.lu_solve: Expected LU and B to have the same dtype, but found LU of type  and B of type r  rv   r7   )r  r  r7   r8   rW     s
    z&linalg_lu_solve_meta.<locals>.<lambda>c                   S   s   dS )NzElinalg.lu_solve: pivots should be a Tensor of scalar type torch.int32r7   r7   r7   r7   r8   rW     rF   zlinalg.lu_solver   c                   S   s   dS )NzYlinalg.lu_solve: Number of pivots per batch should be same as the dimension of the matrixr7   r7   r7   r7   r8   rW     rF   c                      s   d j  dS )Nzclinalg.lu_solve: Expected LU.shape[:-1] and pivots.shape to be the same, but got pivots with shape r  r   r7   r  r7   r8   rW     s    r  r  r   )r   rL   rY   rS   r   r   r  r   r}   r  r  r   rk   r{   r   Zconj)r  r  r  r   r  r  rJ   r  r7   )r  r  r  r8   linalg_lu_solve_meta  s<    




r  )r  r  unpack_dataunpack_pivotsr1   c                    s   t  jdk fdd |r6t |jt jkdd  t j}|d }|d }t||}||d< |rr |}n dg}|r||d<  |}	||d< ||d<  |}
n dg}	 dg}
||	|
fS )Nr   c                      s   d j  dS )NzFtorch.lu_unpack: Expected tensor with 2 or more dimensions. Got size: r  r   r7   r  r7   r8   rW     rF   z lu_unpack_meta.<locals>.<lambda>c                   S   s   dS )Nztorch.lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype.
Note: this function is intended to be used with the output produced by torch.linalg.lu_factorr7   r7   r7   r7   r8   rW     s    rr  r   r   )	rL   rY   r   rS   r  r   r}   r   r|   )r  r  r  r  r  r0  r   r/  r  r  r  r7   r  r8   lu_unpack_meta  s4    




r  )moder1   c                    sR    dkrd}d}n8 dkr$d}d}n& dkr6d}d}nt d fdd ||fS )NreducedTZcompleteFrc                      s   d  dS )Nzqr received unrecognized mode 'z=' but expected one of 'reduced' (default), 'r', or 'complete'r7   r7   r  r7   r8   rW     s    z _parse_qr_mode.<locals>.<lambda>rL   rY   )r  	compute_qr  r7   r  r8   _parse_qr_mode  s    
r  QRr  )rv  r  r1   c                 C   s   t | d t| d t|\}}| jd }| jd }t||}|r|t| j}|rT|n||d< | |}||t|dd n| dg}t| j}	|s|s|n||	d< | |	}
|
|	t|	dd ||
fS )Nz	linalg.qrrr  r   Fr  r   )	r  r   r  r}   r   r   r|   r   r   )rv  r  r  Zreduced_moder0  r   r/  ZQ_shaper  ZR_shaper  r7   r7   r8   linalg_qr_meta$  s"    








r  sign	logabsdet)rv  r1   c                 C   s   t | d t| dd | j}| |d d }| j|d d t| jd}tj|t|d| j| j	d}| j|d d tj
d}||||fS )Nzlinalg.slogdetFrr  rv   r  r   )r   r   r}   r|   rT   rS   rL   r  r   rk   r  )rv  r}   r  r  r  r  r7   r7   r8   _linalg_slogdet@  s    
r  )rv  full_matrices
compute_uvdriverc                 C   s   t | d t| d t| jd d }| jd }| jd }t||}|r|||rT|n|g }| |}	|	|t|dd ||r|n||g }
| |
}t| dk}||
t|
|d n| dg}	| dg}| j||g t	| j
d}|	||fS )	Nz
linalg.svdrr  r   Fr  r   r   rv   )r  r   r   r}   r   r|   r   r   r   rT   rS   )rv  r  r  r  r   r0  r   r/  ZU_shaper  ZV_shapeVZis_cudaSr7   r7   r8   _linalg_svd_metaT  s$    






r  )arg1arg2r1   c                 C   sn   | j d d }|j d d }t||}t|}|| d| dg7 }t|}||d|dg7 }||fS )Nrr  r   )r}   r(   r   r   )r  r  Zarg1_batch_sizesZarg2_batch_sizesexpand_batch_portionarg1_expand_sizearg2_expand_sizer7   r7   r8   r  z  s    
r  )r  r  rw  r1   c                 C   sV   |rt | || t| |\}}|| jkr,| n| |}||jkrD|n||}||fS r2   )r|  r  r}   expand)r  r  rw  r  r  Zarg1_broadcastedZarg2_broadcastedr7   r7   r8   r    s    r  )r   r   r1   c                 C   s6   | j d d }|jdkp0| jd |jko0|j |k}|S )Nr   r   )r}   r   )r   r   Zexpected_batched_rhs_shapevector_caser7   r7   r8   linalg_solve_is_vector_rhs  s
    
r  )r   r  r  r  r  r  )	rv  r  r   r  r  r  r  r  r1   c                   sl  t  d t jjk fdd t }|r@dn}	t |	|d t|	 \}
}t|pl| dd  |r|
d d n|
}tj|t	|| jj
d} j}tj|t	|d j j
d} j|d d tjd} j|d d	 tjd}||||f}||||f}td
d |D rht||D ]6\}}t||j ||j|  t||dd q0|S )Nzlinalg.solvec                      s   d j  dj  dS )NzKlinalg.solve: Expected A and B to have the same dtype, but found A of type r  r  rv   r7   rv  r  r7   r8   rW     s
    z"_linalg_solve_ex.<locals>.<lambda>r   c                   S   s   dS )Nzlinalg.solve: Vector broadcasting of the left hand side is not supported for left=False. In this case linalg.solve is equivalent to B / A.squeeze(-1)r7   r7   r7   r7   r8   rW     s    r  Frv   rr  c                 s   s   | ]}|d uV  qd S r2   r7   r@   r7   r7   r8   r_     rF   z#_linalg_solve_ex.<locals>.<genexpr>)	copy_fromcopy_toexact_dtype)r   rL   rY   rS   r  r   r  r  r  r   rk   r}   r|   r  allzipr$   r   r   r&   )rv  r  r   r  r  r  r  r  r  B_ZB_broad_shaperJ   Zresult_shapeZresult_r}   ZLU_Zpivots_Zinfo_r   resr  or7   r  r8   _linalg_solve_ex  sJ    



r  )r   unitriangularr   )rv  r  r  r   r  r   r1   c          	      C   s   |d u r|  dg}t|ts"J t| ||d t|| d \}}|dd oV| }|rjt||j	}n,t
||j	r||ddj	 |dd |S )Nr   zlinalg.solve_triangularrr  r   )r|   r\   r"   r  r  r  r   Zis_conjr$   r}   r%   r   
transpose_)	rv  r  r  r   r  r   r  ZA_Zavoid_copy_Ar7   r7   r8   linalg_solve_triangular_meta  s    
r  XM)r  )r~   rv  r  r  r  r1   c           	         s   t jdkfdd t  jdk fdd t d  jt jkrt \}}t j|t|ddj	j
d}t j|t|dd j	 j
d}n@ jt jks jt jkrt }d	g}nt dd
d  ||fS )Nr   c                      s   d j  dS )NzMtorch.triangular_solve: Expected b to have at least 2 dimensions, but it has r  r  r7   r   r7   r8   rW     s    z'triangular_solve_meta.<locals>.<lambda>c                      s   d j  dS )NzMtorch.triangular_solve: Expected A to have at least 2 dimensions, but it has r  r  r7   rz  r7   r8   rW     s    triangular_solveFr  r  r   c                   S   s   dS )Nz+triangular_solve: Got an unexpected layout.r7   r7   r7   r7   r8   rW   (  rF   )rL   rY   r   r|  rj   stridedr  r  r   rS   rk   
sparse_csr
sparse_bsrr   r|   )	r~   rv  r  r  r  Zself_broadcast_sizeZA_broadcast_sizeZsolutionZcloned_coefficientr7   ry  r8   triangular_solve_meta  s8    	




r  c                 C   sp   t | d t| d | | jd d }| | j}|| jt| jdd | j| jd d tjd}|||fS )Nz
linalg.detrr  Fr  r   rv   r  )rv  Zdetr  r  r7   r7   r8   _linalg_det_meta-  s    

r  )r   r  r   r   r  r1   c                    s  t jdkdd  t jdkdd  |r4dndt j jd kfdd t j jd kfdd t jd jd kd	d  t jj d
kfdd t jjkfdd jdkrFjd d }jd d t |kfdd jd d  t  |k fdd t jjkfdd t jjkfdd tdd tdd t jjtjddjjdS )Nr   c                   S   s   dS )Nz3torch.ormqr: input must have at least 2 dimensions.r7   r7   r7   r7   r8   rW   E  rF   zormqr.<locals>.<lambda>c                   S   s   dS )Nz3torch.ormqr: other must have at least 2 dimensions.r7   r7   r7   r7   r8   rW   H  rF   rr  r   c                      s   d  dS )Ntorch.ormqr: other.shape[z0] must be greater than or equal to tau.shape[-1]r7   r7   left_size_conditionr7   r8   rW   N  rF   c                      s   d  dS )Nr   z"] must be equal to input.shape[-2]r7   r7   r  r7   r8   rW   R  rF   c                   S   s   dS )NzHtorch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]r7   r7   r7   r7   r8   rW   W  rF   r   c                      s   dj  d j  S )Nz[torch.ormqr: Expected tau to have one dimension less than input, but got tau.ndim equal to r  r  r7   r  r7   r8   rW   \  s
    c                      s   dj  d j  S )Nzhtorch.ormqr: Expected other to have the same number of dimensions as input, but got other.ndim equal to r  r  r7   r   r   r7   r8   rW   c  s
    c                      s
   d  S )NzWtorch.ormqr: Expected batch dimensions of tau to be equal to input.shape[:-2], but got r7   r7   r  r7   r8   rW   n  s    c                      s
   d  S )NzYtorch.ormqr: Expected batch dimensions of other to be equal to input.shape[:-2], but got r7   r7   )actual_batch_other_shaper7   r8   rW   w  s    c                      s   d j  dj  S )NzPtorch.ormqr: Expected input and tau to have the same dtype, but input has dtype z and tau has dtype rv   r7   r  r7   r8   rW     s
    c                      s   d j  dj  S )NzRtorch.ormqr: Expected input and other to have the same dtype, but input has dtype z and other has dtype rv   r7   r  r7   r8   rW     s
    ztorch.ormqrr  r   Fr  r  )	rL   rY   r   r}   rS   r  r  r   rk   )r   r  r   r   r  Zexpected_batch_shaper7   )r  r  r   r  r   r  r8   ormqr;  sn    	






r  c                   s   t td  k fdd j}| d k}|}| }|rftd|D ]}|o`|dk}qLn"td|D ]}|o|dk}qpt |p| fdd d S )Nr   c                      s   dd   dt  S )Nzpadding size is expected to be r   z, but got: r   r7   )rn   paddingr7   r8   rW     rF   z,_padding_check_valid_input.<locals>.<lambda>r   r   c                      s    d d  d d  dj  S )N	Expected r   zD or r   zcD (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: r   r7   )rn   r   r7   r8   rW     s    )rL   rY   r   r   r   r   )r   r  rn   Z	input_dimZis_batch_modeZvalid_batch_modeZvalid_non_batch_moder   r7   )rn   r   r  r8   _padding_check_valid_input  s"    r	  c                   s   d}d d}j dkr0d} d7  |d7 }t|dd |\|}   |rtk o|k  fdd tdkfdd j dkr|fS ||fS d S )	Nr   r   r0   r   c                      s   d d d  dj  S NzcArgument #4: Padding size should be less than the corresponding input dimension, but got: padding (ra   ) at dimension 
 of input r   r7   dim_wr   pad_lpad_rr7   r8   rW     s    z_pad1d_common.<locals>.<lambda>c                      s   d  d S )Nz
input (W: z%) is too small. Calculated output W: r7   r7   )input_woutput_wr7   r8   rW     rF   r   )r   r   r	  rL   rY   r|   )r   r  is_reflection	dim_planenbatchnplaner7   )r  r   r  r  r  r  r8   _pad1d_common  s0    




r  c                 C   s   t | |ddS NTr  )r  r   r  r7   r7   r8   meta_reflection_pad1d  s    r  c                    s*   t  jt jk fdd t |ddS )Nc                      s   d j   dS )Nz)"replication_pad1d" not implemented for ''rS   __str__r7   r  r7   r8   rW     rF   z(meta_replication_pad1d.<locals>.<lambda>Fr  )rL   rY   rS   boolr  r  r7   r  r8   meta_replication_pad1d  s
    

r   c                   s   d |s t t|dkdd  jdkr2 d7  |\ }|  |rzt |k of|k  fdd t  k fdd jS )Nr   r   c                   S   s   dS )Nz padding size is expected to be 2r7   r7   r7   r7   r8   rW     rF   z(_pad1d_backward_common.<locals>.<lambda>r0   c                      s   d d d  dj  S r
  r   r7   r  r7   r8   rW     s    c                      s   d d   S Nz(grad_output width unexpected. Expected: , Got: r   r7   r  grad_outputr  r7   r8   rW     rF   rL   rY   r   r   r   r|   r}   )r$  r   r  r  r  r7   )r  r$  r   r  r  r  r8   _pad1d_backward_common  s$    

r&  
grad_inputc                 C   s   t | ||ddS r  r&  r$  r   r  r7   r7   r8   meta_reflection_pad1d_backward
  s    r*  c                 C   s   t | ||ddS )NFr  r(  r)  r7   r7   r8   meta_replication_pad1d_backward  s    r+  c                   s8  dd d}d}t |dd j}|dkrNd}d7  d7  |d7 }|\	
|} 
   	 |rtk o	k 	fdd t
k ök  
fdd tdkpdkfd	d jd
kr"|fS ||fS d S )Nr   r   r   r      c                      s   d d d  dj  S r
  r   r7   r  r7   r8   rW   0  s    z_pad2d_common.<locals>.<lambda>c                      s   d d d  dj  S NzcArgument #6: Padding size should be less than the corresponding input dimension, but got: padding (ra   r  r  r   r7   dim_hr   pad_bpad_tr7   r8   rW   7  s    c                      s   d  d d d S )Nz
input (H:  W: z%) is too small. Calculated output H: r7   r7   )input_hr  output_hr  r7   r8   rW   ?  s
    r0   r	  r   r   rL   rY   r|   )r   r  r  Z
dim_slicesr  r   r  r7   )r/  r  r   r3  r  r4  r  r0  r  r  r1  r8   _pad2d_common  sB    



r6  c                 C   s   t | |ddS r  )r6  r  r7   r7   r8   meta_reflection_pad2dK  s    r7  c                    s*   t  jt jk fdd t |ddS )Nc                      s   d j   dS )Nz)"replication_pad2d" not implemented for 'r  r  r7   r  r7   r8   rW   V  rF   z(meta_replication_pad2d.<locals>.<lambda>Fr  )rL   rY   rS   r  r6  r  r7   r  r8   meta_replication_pad2dQ  s
    

r8  c                    s   dd d}|j }| dkr6d7  d7  |d7 }|\}}}}|  }	| }
|	| | |
| | tkfdd t k fdd ||j S )Nr   r   r   r,  c                      s   d d   S r!  r   r7   r#  r7   r8   rW   x  rF   z%meta_pad2d_backward.<locals>.<lambda>c                      s   d d   S Nz)grad_output height unexpected. Expected: r"  r   r7   r/  r$  r4  r7   r8   rW   |  rF   )r}   rn   rL   rY   r   r|   )r$  r~   r  r  rV   r  r  r1  r0  r3  r  r7   )r/  r  r$  r4  r  r8   meta_pad2d_backward[  s,    
r;  c             	      s  ddd d}t |dd jdk}|rVd}d7 d7  d7  |d7 }|\
|}    
   	|r,tk oʈk fdd tk o
k 
fd	d tk ok  fd
d t	dkpJdkpJdk	fdd |r|||	fS |	fS d S )Nr0   r   r   r   r      c                      s   d d d  dj  S r
  r   r7   r  r7   r8   rW     s    z_pad3d_common.<locals>.<lambda>c                      s   d d d  dj  S r-  r   r7   r.  r7   r8   rW     s    c                      s   d d d  dj  S )NzcArgument #8: Padding size should be less than the corresponding input dimension, but got: padding (ra   r  r  r   r7   )dim_dr   pad_bkpad_fr7   r8   rW     s    c                      s(   d  d d d d d S )Nz
input (D:  H: r2  z%) is too small. Calculated output D: r7   r7   )input_dr3  r  output_dr4  r  r7   r8   rW     s    r5  )r   r  r  r  Z
batch_moder  r  r7   )r=  r/  r  r   rA  r3  r  rB  r4  r  r0  r>  r?  r  r  r1  r8   _pad3d_common  sP    





rC  c                 C   s   t | |ddS r  )rC  r  r7   r7   r8   meta_reflection_pad3d  s    rD  c                    s*   t  jt jk fdd t |ddS )Nc                      s   d j   dS )Nz)"replication_pad3d" not implemented for 'r  r  r7   r  r7   r8   rW     rF   z(meta_replication_pad3d.<locals>.<lambda>Fr  )rL   rY   rS   r  rC  r  r7   r  r8   meta_replication_pad3d  s
    

rE  c                    s(  t t|dkdd  |jdks&J j|jks6J ddd |jdkrdd7 d7  d7  |\}}}}}}| }	|}
|}|	| | |
| | || | t kfdd t kfd	d t  k fd
d ||jS )N   c                   S   s   dS )Nz padding size is expected to be 6r7   r7   r7   r7   r8   rW     rF   z%meta_pad3d_backward.<locals>.<lambda>r0   r   r   r<  c                      s   d d   S r!  r   r7   r#  r7   r8   rW     rF   c                      s   d d   S r9  r   r7   r:  r7   r8   rW     rF   c                      s   d d   S )Nz(grad_output depth unexpected. Expected: r"  r   r7   )r=  r$  rB  r7   r8   rW     rF   r%  )r$  r   r  r  r  r1  r0  r?  r>  rA  r3  r  r7   )r=  r/  r  r$  rB  r4  r  r8   meta_pad3d_backward  s<    




rG  r   )r~   pr1   c                 C   sb   t |  dd  | d}|dkr<| dgjt jdS | ||d  d fjt jdS d S )Nc                   S   s   dS )Nz(_pdist_forward requires contiguous inputr7   r7   r7   r7   r8   rW   	  rF   z%meta__pdist_forward.<locals>.<lambda>r   r   r   r   )rL   rY   r   r   r|   r  r  )r~   rH  r   r7   r7   r8   meta__pdist_forward 	  s    
rI  )gradr~   rH  pdistr1   c                 C   s8   t | dd  t | dd  t j|t jdS )Nc                   S   s   dS )Nz._pdist_backward requires self to be contiguousr7   r7   r7   r7   r8   rW   	  rF   z&meta__pdist_backward.<locals>.<lambda>c                   S   s   dS )Nz/_pdist_backward requires pdist to be contiguousr7   r7   r7   r7   r8   rW   	  rF   r   )rL   rY   r   r   r  )rJ  r~   rH  rK  r7   r7   r8   meta__pdist_backward	  s    rL  )r   r  c                   s  ddl m}m}  d} d}d}	|t|j|||	frZ|||	ft 	 dkdd  t	 dkdd  t
jstj j  kojkn   fd	d  j}
j|
d |
d td kod kfd
d  S )Nr   )r   r   r   r   r0   c                   S   s   dS Nzbatch1 must be a 3D tensorr7   r7   r7   r7   r8   rW   %	  rF   zmeta_baddbmm.<locals>.<lambda>c                   S   s   dS Nzbatch2 must be a 3D tensorr7   r7   r7   r7   r8   rW   &	  rF   c                      s   dj  d j  dj  S )Nz+Input dtypes must be the same, got: input: z
, batch1: z
, batch2: rv   r7   )batch1batch2r~   r7   r8   rW   *	  rF   c                	      s&   d d d d  d d  d	S Nz@Expected size for first two dimensions of batch2 tensor to be: [ra   z] but got: [r   r   ].r7   r7   batch2_sizesbscontraction_sizer7   r8   rW   2	  s    )r   r   r   r   rL   Zsym_notr}   r  rY   rn   
exp_configZ&skip_dtype_check_in_meta_registrationsrS   r|   )r~   rO  rP  r   r  r   r   dim1dim2Zdim3batch1_sizesr7   )rO  rP  rT  rU  rV  r~   r8   meta_baddbmm	  s,    


r[  c                C   s   t j| t jdS r   r5  r~   r   r7   r7   r8   meta_bernoulli:	  s    r]        ?c                 C   s   | S r2   r7   r~   rH  r   r7   r7   r8   meta_bernoulli_A	  s    r`  c                 C   s   t j| t jdS r   r5  r_  r7   r7   r8   meta_bernoulli_pF	  s    ra  c                 C   s
   t | S r2   rL   r   r\  r7   r7   r8   meta_poissonL	  s    rc  c                 C   s6   t |
|  k dd  t j| t jd}t | |fS )Nc                   S   s   dS )NzJError in fused_moving_avg_obs_fake_quant_cpu: ch_axis must be < self.dim()r7   r7   r7   r7   r8   rW   d	  rF   z6meta__fused_moving_avg_obs_fq_helper.<locals>.<lambda>rv   )rL   rY   rn   r   r  )r~   Zobserver_onZfake_quant_onZrunning_minZrunning_maxscaleZ
zero_pointZaveraging_constZ	quant_minZ	quant_maxZch_axisZper_row_fake_quantZsymmetric_quantmaskr7   r7   r8   $meta__fused_moving_avg_obs_fq_helperR	  s    
rf  c                    sn   t |  dkdd  t | dkdd  | j\ |j\t  k fdd | S )Nr   c                   S   s   dS )Nza must be 2Dr7   r7   r7   r7   r8   rW   m	  rF   zmeta_mm.<locals>.<lambda>c                   S   s   dS )Nzb must be 2Dr7   r7   r7   r7   r8   rW   n	  rF   c                	      s   d d  d d d	S )Nz/a and b must have same reduction dim, but got [ra   z] X [rR  r7   r7   ZM1ZM2Nr  r7   r8   rW   s	  rF   )rL   rY   rn   r}   r|   r   br7   rg  r8   meta_mmj	  s    

rk  c                    s0   |r"t  fddtjD S tj S )Nc                 3   s$   | ]}| vrj | nd V  qdS )r   Nr   rA   r   dimsr~   r7   r8   r_   z	  rF   z+_compute_reduction_shape.<locals>.<genexpr>)rX   r   r   rG   compute_reduction_output_shaper}   )r~   rn  rP  r7   rm  r8   rN  x	  s    rN  strc                 C   sH   t | tjjr| jjS t| dr@t| jdr@| jjdkr@| jjS dS d S )Nrk   rc   rh   r   )r\   rL   Z_subclassesZ
FakeTensorZfake_devicerc   hasattrrk   )r  r7   r7   r8   r   	  s    

r   )input_tensorr  r   r  dilationis_transposedgroupsoutput_paddingc                    s:  t t t t t t ddd}t t t t t t t ddd}	|jdd  }
| jdd   |rb||jd  }n*|jd	 }|jd | | jd krtd
| jd	 |gt|tr|gt  }nt|dkr|d	 gt  }t|tr|gt  }n t|dkr|d	 gt  }t|tr(|gt  }n t|dkrH|d	 gt  }d }|rt|trn|gt  }n&t|dkr|d	 gt  }n|}tt D ]h}|r܈|	 | || || |
| || ||  n*| | || || |
| ||  qt	t
dd dd  D  fdd S )N)lnrH  r   r/  r   r1   c                 S   s$   | d|  ||d   d | d S )a  
        Formula to apply to calculate the length of some dimension of the output

        See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html

        Args:
            ln: length of the dimension
            p: padding in that dim
            d: dilation in that dim
            k: kernel size in that dim
            s: stride in that dim
        Returns:
            The output length
        r   r   r7   )rw  rH  r   r/  r   r7   r7   r8   _formula	  s    z+calc_conv_nd_return_shape.<locals>._formula)rw  rH  r   r/  r   r4   r1   c                 S   s(   | d | d|  ||d   | d S )a  
        Formula to apply to calculate the length of some dimension of the output
        if transposed convolution is used.
        See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html

        Args:
            ln: length of the dimension
            p: padding in that dim
            d: dilation in that dim
            k: kernel size in that dim
            s: stride in that dim
            op: output padding in that dim

        Returns:
            The output length
        r   r   r7   )rw  rH  r   r/  r   r4   r7   r7   r8   _formula_transposed	  s    z6calc_conv_nd_return_shape.<locals>._formula_transposedr   r   r   zInvalid channel dimensionsc                 s   s   | ]}|d kV  qdS r   Nr7   r@   r7   r7   r8   r_   	  rF   z,calc_conv_nd_return_shape.<locals>.<genexpr>c                      s   dt   ddd   dS )NzGiven input size per channel: z&. Calculated output size per channel: r   z. Output size is too small)r   r7   rn  Z	ret_shaper7   r8   rW   	  s   
z+calc_conv_nd_return_shape.<locals>.<lambda>)r   r}   r  r\   r   r   r   r   rL   rY   ro   )rr  r  r   r  rs  rt  ru  rv  rx  ry  kernel_sizeZout_channelsZoutput_padding_listr   r7   r{  r8   calc_conv_nd_return_shape	  sb    



"r}  c                 C   s   t j| t jkS r2   rL   _prims_commonr!   channels_lastZtenr7   r7   r8   is_channels_last	  s    r  )rr  r  r  running_meanrunning_vartrainingexponential_average_factorepsilonc                    s    j }|d ur|j n|j }	|d ur(|j n|j }
 fdd} |j| d}|rh |	} |
}n d} d}|||fS )Nc                      s(   t  rtjS  jtjdr"tjS tjS r   )r  rL   r  r   r   r7   rr  r7   r8   pick_memory_format
  s
    z2meta_miopen_batch_norm.<locals>.pick_memory_formatr   r   )r}   r|   r  )rr  r  r  r  r  r  r  r  r   Zsave_mean_shapeZsave_var_shaper  r   Z	save_meanZsave_varr7   r  r8   meta_miopen_batch_norm 
  s    


r  )	rr  r  r  r   r  rs  rt  rv  ru  c	              	      sf    fdd}	t  ||||||r&|nd }
d}d} |dkrJd|
|<  |
}|j|	 d}|S )Nc                      s^   t  dkr$t str2tjS nt r2tjS  jtjdrFtjS  jtjdrZtjS d S Nr   r   )r   r  rL   r  r   r   preserve_formatr7   rr  r  r7   r8   r  2
  s    z%meta_conv.<locals>.pick_memory_formatr   r   r   )r}  r   r|   r  )rr  r  r  r   r  rs  rt  rv  ru  r  	shape_outZinput_channels_dimZoutput_channels_dimr   r7   r  r8   	meta_conv&
  s$    

r  mkldnnc
              	   C   sH   t | ||||d|g }
| |
}tj}|  dkr8tj}|j|d}|S )NFr<  r   )r}  r|   rL   r  rn   channels_last_3dr  )rr  r  r  r  r   rs  ru  attrscalars	algorithmr  r   Zout_memory_formatr7   r7   r8   meta_mkldnn_convolution_defaultX
  s    
r  c                 C   s$   |  g | jd d |jd R S Nr   r   r|   r}   )rr  r  r  r  r  r  r7   r7   r8   meta_linear_pointwise_defaulto
  s    r  mklc                 C   s$   |  g | jd d |jd R S r  r  )rr  Zpacked_weightZorig_weightr  r   r7   r7   r8   meta_mkl_linearz
  s    r  onednnc              	   C   s|   t | ||||	d|
d }|tjtjtjtjfv s2J | j||d}t|dv sTJ dt|dkrftjntj	}|j
|d}|S )NFrv   r0   r,  zonly conv1d/2d are supportedr,  r   )r}  rL   r+  r-  uint8r  r|   r   r  r   r  )rB   x_scalex_zpww_scalew_zpr  r   r  rs  ru  output_scaleoutput_zero_pointoutput_dtyper  r  r  r  r   formatr7   r7   r8   meta_qconv_pointwise
  s     
r  c                 C   s   |dksJ |S )Nsumr7   )rB   r  r  r  r  r  accumr  r   r  rs  ru  r  r  r  Zaccum_scaleZaccum_zero_pointbinary_op_namer  unary_op_nameunary_op_argsunary_op_algorithmr7   r7   r8   meta_qconv2d_pointwise_binary
  s    r  c                 C   sF   t | j}|jd |d< |	tjtjtjtjfv s4J | j||	d}|S )Nr   r   rv   )r   r}   rL   r+  r-  r  r  r|   )rB   r  r  r  r  r  r  r  r  r  Zpost_op_nameZpost_op_argsZpost_op_algorithmr1  r   r7   r7   r8   meta_qlinear_pointwise
  s
    
r  c                 C   sR   |dkr|S t | j}|jd |d< |
tjtjtjtjfv s@J | j||
d}|S )Nr  r   r   rv   )r   r}   rL   r+  r-  r  r  r|   )rB   r  r  r  r  r  Zx_2r  r  r  r  Zx2_scaleZx2_zpr  r  r  r  r  r1  r   r7   r7   r8   meta_qlinear_pointwise_binary
  s    
r  c                 C   s&   t | j}|jd |d< | |}|S )Nr   r   )r   r}   r|   )rB   r  r  r1  r   r7   r7   r8   meta_linear_dynamic_fp16
  s    

r  	quantizedr7   r   r   c                 C   sr   t | |||||\}}}|  dkr.| dnd}	tj}
|  dkrP|||g}n|	|||g}tj|| j| j|
dS Nr,  r   r0   rB  )#max_pool2d_checks_and_compute_shapern   r   rL   r  rs   rS   rk   r   r|  r   r  rs  	ceil_modenInputPlaneoutputHeightoutputWidthr  r   r   r7   r7   r8   meta_quantized_max_pool2d  s$    r  c                 C   s   t |  dkd|   d t | dkd|  d t | jt jt jt jfv d| j  t |jt jkd|j  t |jt jkd|j  t |j| jkd|j  | j	| 
d	|
d	| jd
S )Nr   zx must be a 2D tensor, got Dzw must be a 2D tensor, got #expected x to be f32/f16/bf16, got expected w to be uint8, got z q_group_size must be int64, got z5q_scale_and_zeros must have the same dtype as x, got r   rv   )rL   rY   rn   rS   r+  r,  r-  r  r   r|   r   rB   r  q_group_sizeZq_scale_and_zerosr7   r7   r8   meta_int4mm_packed_weight_cpu+  s       




r  c                    s4   t   koj k fdd d S )Nc                      s8   d  d d dd   d dj   S )NzExpected a tensor of dimension z and tensor.size[z] == ra   zbut got : dimension z] = rn   r}   r7   rn   dim_sizer   r  r7   r8   rW   C  s   z check_dim_size.<locals>.<lambda>)rL   rY   rn   r}   )r  rn   r  r   r7   r  r8   check_dim_size@  s    r  c                    s  dd }|d|\}}	t t|dv dd  t  jt jt jt jt jfv fdd t|dkrp||	 }
}n.t|d	kr|d |d  }
}n|d
|\}
}|d|\}}t |d u p|dkdd    dkrވ 	dnd	} 	d} 	d} 	d}t
||||
d	|}t
||	||d	|}t }t ||	|
|||d	d	||||||   dkrl|||g}n||||g}t j| j j|dS )Nc                    sD   t t|dv  fdd |d }t|dkr4|n|d }||fS )Nr   r   c                      s   d  dS )Nzavg_pool2d: 4 must either be a single int, or a tuple of two intsr7   r7   rw  r7   r8   rW   U  rF   z1meta_avg_pool2d.<locals>.unpack.<locals>.<lambda>r   r   rL   rY   r   rw  r[  HWr7   r  r8   unpackR  s    

zmeta_avg_pool2d.<locals>.unpackr|  r   r   r   c                   S   s   dS NzOavg_pool2d: stride must either be omitted, a single int, or a tuple of two intsr7   r7   r7   r7   r8   rW   ^  rF   z!meta_avg_pool2d.<locals>.<lambda>c                      s   d j   dS )Nz""avg_pool2d" not implemented for 'r  r  r7   r  r7   r8   rW   b  rF   r   r   r   r  c                   S   s   dS Nzdivisor must be not zeror7   r7   r7   r7   r8   rW   o  rF   r,  r  rr  r   r0   rB  )rL   rY   r   rS   r  uint16uint32uint64rn   r   pooling_output_shaperG   r!   pool2d_shape_checkrs   rk   )r   r|  r   r  r  count_include_paddivisor_overrider  kHkWdHdWpadHpadWr  r  inputHeight
inputWidthr  r  r   r   r7   r  r8   meta_avg_pool2dH  sj    
	





r  c                 C   sj   t | ||||||dd|	|
|||| |  }|	}t|||d | t|||d | t|||d | d S )Nr   r0   r   )r  rn   r  )r   Z
gradOutputr  r  r  r  r  r  r  r  r  r  r  r  
mem_formatr   nOutputPlaner7   r7   r8   avg_pool2d_backward_shape_check  s,    r  c                 C   s  t t|dkpt|dkdd  |d }t|dkr<|n|d }	t t|dkpjt|dkpjt|dkdd  t|dkr|n|d }
t|dkr|	nt|dkr|
n|d }t t|dkpt|dkdd  |d }t|dkr|n|d }t |d u p|dkdd  |j}| d	kr2|d
 nd}|d }|d }|d }t||||
d|}t||	||d|}t|}t|| |||	|
||||||||| t j	||j
|j|dS )Nr   r   c                   S   s   dS )NzKavg_pool2d: kernel_size must either be a single int, or a tuple of two intsr7   r7   r7   r7   r8   rW     rF   z*meta_avg_pool2d_backward.<locals>.<lambda>r   c                   S   s   dS r  r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )NzGavg_pool2d: padding must either be a single int, or a tuple of two intsr7   r7   r7   r7   r8   rW     rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW     rF   r,  r  r  rr  r   rB  )rL   rY   r   r}   rn   r  rG   r!   r  rs   rS   rk   )ZgradOutput_r   r|  r   r  r  r  r  r  r  r  r  r  r  
input_sizer  r  r  r  r  r  r  r7   r7   r8   meta_avg_pool2d_backward  sj    "(
r  c                    s@  t t|dv dd  |d }t|dkr0|n|d }t|dkrH|n|d }	t | pdt|dv dd  t  jt jt jt jt jfv fdd |s|n|d }
|s|nt|dkr|
n|d }|s|	nt|dkr|
n|d }t t|dv d	d  |d }t|dkr|n|d }t|dkr4|n|d }t  jd
v dd  t | pd|dkdd   	d} 	d} 	d} 	d} 	d}t
||||
d|}t
||||d|}t
||	||d|}t ||||	|
|||||ddd||||||ddd  jdkr( ||||fS  |||||fS d S )Nr   r0   c                   S   s   dS NzFavg_pool3d: kernel_size must be a single int, or a tuple of three intsr7   r7   r7   r7   r8   rW     rF   z!meta_avg_pool3d.<locals>.<lambda>r   r   r   c                   S   s   dS NzJavg_pool3d: stride must be omitted, a single int, or a tuple of three intsr7   r7   r7   r7   r8   rW   $  rF   c                      s   d j   dS )Nz""avg_pool3d" not implemented for 'r  r  r7   r  r7   r8   rW   (  rF   c                   S   s   dS NzBavg_pool3d: padding must be a single int, or a tuple of three intsr7   r7   r7   r7   r8   rW   0  rF   r,  r<  c                   S   s   dS Nz9non-empty 4D or 5D (batch mode) tensor expected for inputr7   r7   r7   r7   r8   rW   8  rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW   =  rF   r  r  rr  r   zavg_pool3d()T)check_input_sizer,  )rL   rY   r   rS   r  r  r  r  r   r   r  pool3d_shape_checkr|   )r   r|  r   r  r  r  r  kTr  r  dTr  r  padTr  r  r  nslicesitimeiheightiwidthotimeoheightowidthr7   r  r8   meta_avg_pool3d  s    

  





r  c                 C   s  t t|dv dd  |d }t|dkr0|n|d }	t|dkrH|n|d }
t | pdt|dv dd  |sv|n|d }|s|	nt|dkr|n|d }|s|
nt|dkr|n|d }t t|dv dd  |d }t|dkr|n|d }t|dkr|n|d }t |jd	v d
d  t | p8|dkdd  |d}|d}|d}|d}t||||d|}t||	||d|}t||
||d|}t|| |||	|
||||||||||||d ||jS )Nr  c                   S   s   dS r  r7   r7   r7   r7   r8   rW   w  rF   z*meta_avg_pool3d_backward.<locals>.<lambda>r   r   r   c                   S   s   dS r  r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW     rF   r  c                   S   s   dS r  r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW     rF   r  r  rr  r   zavg_pool3d_backward())	rL   rY   r   r   r   r  avg_pool3d_backward_shape_checkr|   r}   )r$  r   r|  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  Zotime_for_shape_checkZoheight_for_shape_checkZowidth_for_shape_checkr7   r7   r8   meta_avg_pool3d_backwardi  st    
  




r  c                    sZ   t  jdkp jdk fdd  jd d t| }t }t j| j j	|dS )Nr0   r,  c                      s   d j  S )Nz"Expected 3D or 4D tensor, but got r   r7   r   r7   r8   rW     rF   z*meta_adaptive_avg_pool2d.<locals>.<lambda>rr  rB  )
rL   rY   r   r}   rX   rG   r!   rs   rS   rk   )r~   output_sizer1  r   r7   r   r8   meta_adaptive_avg_pool2d  s    

r  c                    s@   t  jdkp jdk fdd   jd d t| S )Nr,  r<  c                      s   d j  S )Nz"Expected 4D or 5D tensor, but got r   r7   r   r7   r8   rW     rF   z*meta_adaptive_avg_pool3d.<locals>.<lambda>r  )rL   rY   r   r|   r}   rX   )r~   r  r7   r   r8   meta_adaptive_avg_pool3d  s
    
r  c                    s    j }td|D ]$t dk fdd qt|dkpH|dkfdd tj jk fdd tj}trtj}	j
j|d	S )
Nr   r   c                      s   d j  d dS )Nz{adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero                       size for non-batch dimensions,  with dimension  being emptyr   r7   )grad_outr   r7   r8   rW     s   z4meta__adaptive_avg_pool2d_backward.<locals>.<lambda>r0   r,  c                      s   d j  S )NzBadaptive_avg_pool2d_backward(): Expected 3D or 4D tensor, but got r   r7   r   r7   r8   rW     rF   c                      s   dj  d j  S Nexpected dtype z! for `grad_output` but got dtype rv   r7   )r  r~   r7   r8   rW     rF   r   )r   r   rL   rY   r   rS   r   r  r  r|   r}   r  )r  r~   r   r   r7   )r  r   r~   r8   "meta__adaptive_avg_pool2d_backward  s$    

r  c                 C   s   t | d tj|tjdS )NZadaptive_avg_pool3d_backwardr   )!_adaptive_pool_empty_output_checkrL   r   r  r$  r~   r7   r7   r8   "meta__adaptive_avg_pool3d_backward  s    
r  )r$  r  c                    s<   j }td|D ]&tdk fdd qd S )Nr   r   c                      s     dj  d dS )Nzc(): Expected grad_output to have non-zero size for non-batch dimensions, but grad_output has sizes r   r  r   r7   r  r$  r   r7   r8   rW     s
    z3_adaptive_pool_empty_output_check.<locals>.<lambda>)r   r   rL   rY   r   )r$  r  r   r7   r	  r8   r    s    r  r   c                    s&  j }t|dv fdd td|D ]$ t dk fdd q(tt|dkdd  d}d}d}j d	krd}|d7 }|d }|\}}j d
kr|||f}|}	j|tjd}
|	|
fS ||||f}t	}|j
|d}	j|tjdj
|d}
|	|
fS d S )Nr  c                      s   d j  S )Nz:adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: r   r7   r  r7   r8   rW     rF   z*meta_adaptive_max_pool2d.<locals>.<lambda>r   r   c                      s   dj  d  dS )Nzjadaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but input has sizes r   r  r   r7   r   r   r7   r8   rW   	  s
    r   c                   S   s   dS )NzCadaptive_max_pool2d(): internal error: output_size.size() must be 2r7   r7   r7   r7   r8   rW     rF   r,  r0   rv   r   )r   rL   rY   r   r   r   r|   r   rG   r!   r  )r   r  r   ZdimHsizeBsizeDosizeHosizeWr   r   r   r   r7   r
  r8   meta_adaptive_max_pool2d  sD    







r  c                    sd    j }t|dv  fdd t d tj jk fdd t}jj	|dS )Nr  c                      s   d j  S )NzKadaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: r   r7   r$  r7   r8   rW   4  rF   z3meta_adaptive_max_pool2d_backward.<locals>.<lambda>adaptive_max_pool2d_backwardc                      s   dj  d j  S r  rv   r7   r$  r   r7   r8   rW   ;  rF   r   )
r   rL   rY   r  rS   rG   r!   r|   r}   r  )r$  r   r   r   r   r7   r  r8   !meta_adaptive_max_pool2d_backward.  s    



r  c                    s   j }t|dv fdd td|D ]$ t dk fdd q(tt|dkdd  d}d}d}|d	krd}|d7 }|}|\}}}|d
kr||||f}	n|||||f}	|	}
j|	tjd}|
|fS )Nr  c                      s   d j  S )Nz:adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: r   r7   r  r7   r8   rW   H  rF   z*meta_adaptive_max_pool3d.<locals>.<lambda>r   r   c                      s   dj  d  dS )Nzjadaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, but input has sizes r   r  r   r7   r
  r7   r8   rW   M  s
    r0   c                   S   s   dS )NzCadaptive_max_pool3d(): internal error: output_size.size() must be 3r7   r7   r7   r7   r8   rW   U  rF   r<  r,  rv   )r   rL   rY   r   r   r   r|   r   )r   r  r   ZdimDr  r  ZosizeTr  r  r   r   r   r7   r
  r8   meta_adaptive_max_pool3dB  s8    





r  c                 C   s   t | d ||jS )Nadaptive_max_pool3d_backward)r  r|   r}   )r$  r   r   r7   r7   r8   !meta_adaptive_max_pool3d_backwardn  s    
r  c                 C   s   |d u rt d| |S )Nz:cannot repeat_interleave a meta tensor without output_size)r  r|   )repeatsr  r7   r7   r8   meta_repeat_interleave_Tensoru  s    r  c                 C   s:   | j jsJ |j jsJ t| j|j}| j|t| j dS rK  )rS   r   r(   r}   r|   r   )realimagr   r7   r7   r8   meta_complex|  s    r  )
fill_valuec                C   s   | j ||  ftjdS rK  )r|   rn   rL   ry   )r~   r   r  r7   r7   r8   nonzero_static  s    r  c                 C   s<   t tjdd  t j|  |  fd|  ft j| jdS )Nc                   S   s   dS )NaY  The register_meta function for torch.nonzero() raises unimplemented by default, as a correct data-independent implementation does not exist. This implementation returns a fake value, assuming all elements of the tensor are non-zero. To enable this registration, please set 'torch.fx.experimental._config.meta_nonzero_assume_all_nonzero' to True.r7   r7   r7   r7   r8   rW     rF   znonzero.<locals>.<lambda>r   rS   rk   )	rL   Z_check_not_implementedrW  Zmeta_nonzero_assume_all_nonzeror  r{   rn   ry   rk   r   r7   r7   r8   nonzero  s    
r  c              
      sH  t tdd  g }tD ]\d urt jt jt jt jt jfv dd  jt jt jfv r }t	|t 
j jkfdd tjD ]Ft 
j j  kfdd ||d qn
| q | q |t t	jkfdd dd lm} t|j t	jk r`d  qBd}d	}D ]J|dkrd urd}n*|dkrd u rd
}nd url qqld}|s<g }g }tD ](\d ur| | qtD ](\d u r| | q||g g  g tD ]H\}	d u rrz j|	  nj|	  n
tjqP fdd}
   }ddlm} | dkr|S |
}t|}t|ttt	|krDt|j|}t|}t|t|}|| |}|S )Nc                   S   s   dS )Nz#at least one index must be providedr7   r7   r7   r7   r8   rW     rF   z#meta_index_Tensor.<locals>.<lambda>c                   S   s   dS )Nz?tensors used as indices must be long, int, byte or bool tensorsr7   r7   r7   r7   r8   rW     rF   c                      s   d j  S )N)too many indices for tensor of dimension r  r7   r   r7   r8   rW     rF   c                	      s$   dj  d  dj  d  S )NzThe shape of the mask 
 at index z0 does not match the shape of the indexed tensor r   r7   )r   rx   jr/  r~   r7   r8   rW     s   r   c                      s   dj  dt  dS )Nr   z (got rb   )r   r   r7   )r   r~   r7   r8   rW     rF   r   Fr   Tc                    sL      }t |  }dgt |tt| jt  < | ||S )zI
        This follows restride_src in TensorAdvancedIndexing.cpp
        r   )r   r   r   r}   r   )r~   r}   r   )after_shapebefore_shapereplacement_shaper7   r8   _restride_src   s     z(meta_index_Tensor.<locals>._restride_srcguard_size_oblivious) rL   rY   r  	enumeraterS   ry   r   r  r  r   rz   r   r   r}   r   selecttorch._refsr   r   r)   r   r|   r   r(  r{   rG   Z3compute_elementwise_output_logical_to_physical_permZ
apply_permr   Zinvert_permr   r   )r~   r   r  r  refsstateZhas_contiguous_subspacern  Ztransposed_indicesrn   r&  r   r(  Zrestrided_selfpermZ
perm_shaper   r7   )	r#  r$  r   rx   r   r"  r/  r%  r~   r8   meta_index_Tensor  s    














r/  c                 C   sT   d }d }d }|
d r"|  | }|
d r8|  | }|
d rJ|  |}|||fS )Nr   r   r   r|   r   )grad_output_input_weight_Zbias_sizes_optr   r  rs  Z
transposedrv  ru  output_maskZbackend_grad_inputZbackend_grad_weightZbackend_grad_biasr7   r7   r8   meta_convolution_backward!  s    
r5  c                   s     d} d}| ||f} t  dkdd  t dkdd  t  d dk fdd t  d dk fd	d t|  d|ko|  d|kd
d  | |   S )Nr   r   r0   c                   S   s   dS rM  r7   r7   r7   r7   r8   rW   E  rF   zmeta_addbmm.<locals>.<lambda>c                   S   s   dS rN  r7   r7   r7   r7   r8   rW   F  rF   r   c                      s   d  d d d S )Nz8batch1 and batch2 must have same number of batches, got r   r   r   r7   rO  rP  r7   r8   rW   I  rF   c                
      s6   d  d d  d d d d d d	S )Nz#Incompatible matrix sizes for bmm (r   rB   r   r   rb   r   r7   r6  r7   r8   rW   M  s
    c                   S   s   dS )Nz.self tensor does not match matmul output shaper7   r7   r7   r7   r8   rW   T  rF   )r   r  rL   rY   rn   r|   )r~   rO  rP  r   r  rX  rY  r7   r6  r8   meta_addbmm?  s$    

r7  c                 K   s   |  |  S r2   r0  )r~   r   kwargsr7   r7   r8   meta_randint_likeY  s    r9  )
grad_scale	found_infc       	            s4   | |||||fD ] t t t fdd qd S )Nc                      s   dt   S Nz'exponent must be a tensor list but got rc   r7   lr7   r8   rW   t  rF   z#meta__fused_adam_.<locals>.<lambda>rL   rY   r\   r   )r~   gradsexp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepslrbeta1beta2weight_decayepsamsgradmaximizer:  r;  r7   r>  r8   meta__fused_adam_^  s
    
rM  c       	            sZ   | |||||fD ] t t t fdd qdd }|| ||||||||fS )Nc                      s   dt   S r<  r=  r7   r>  r7   r8   rW     rF   z"meta__fused_adam.<locals>.<lambda>c                 S   s   dd | D S )Nc                 S   s   g | ]}t |qS r7   rb  )rA   r~  r7   r7   r8   rE     rF   z=meta__fused_adam.<locals>.empty_like_list.<locals>.<listcomp>r7   )Ztensor_listr7   r7   r8   empty_like_list  s    z)meta__fused_adam.<locals>.empty_like_listr@  )r~   rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  r:  r;  rN  r7   r>  r8   meta__fused_adamx  s    
rO  c                    s   t   dkdd  t  dkdd  t  jt ju  fdd t jt ju fdd t  ddk fd	d  j ddft jd
S )Nr   c                   S   s   dS )Nza must be a 2D tensorr7   r7   r7   r7   r8   rW     rF   zmeta__int_mm.<locals>.<lambda>c                   S   s   dS )Nzb must be a 2D tensorr7   r7   r7   r7   r8   rW     rF   c                      s   d j  S )Nzexpected self to be int8, got rv   r7   )r   r7   r8   rW     rF   c                      s   d j  S )Nzexpected mat2 to be int8, got rv   r7   )rj  r7   r8   rW     rF   r   r   c                
      s6   d  d d  d d d d d d	S )Nz'Incompatible matrix sizes for _int_mm (r   rB   r   r   rb   r   r7   ri  r7   r8   rW     s
    rv   )rL   rY   rn   rS   r  r   r|   r  ri  r7   ri  r8   meta__int_mm  s    



rP  c                    st   t   dkdd  t  jt ju  fdd  d} dd } j|d ||d  d	|d ft jd
S )Nr   c                   S   s   dS Nzw must be a 2D tensorr7   r7   r7   r7   r8   rW     rF   z2meta__convert_weight_to_int4pack.<locals>.<lambda>c                      s   d j  S Nr  rv   r7   r  r7   r8   rW     rF   r   r      r*      rv   )rL   rY   rn   rS   r  r   r|   r  r  Zinner_k_tilesr   r/  r7   rS  r8    meta__convert_weight_to_int4pack  s    



rW  c                    s`   t   dkdd  t  jt ju  fdd  d} d} j||d ft jdS )Nr   c                   S   s   dS rQ  r7   r7   r7   r7   r8   rW     rF   z:meta__convert_weight_to_int4pack_for_cpu.<locals>.<lambda>c                      s   d j  S Nzexpected w to be int32, got rv   r7   rS  r7   r8   rW     rF   r   r   rv   )rL   rY   rn   rS   r  r   r|   r  rV  r7   rS  r8   (meta__convert_weight_to_int4pack_for_cpu  s    




rY  c                    s   t  dkdd  t   dkdd  t jt jt jt jfv fdd t  jt ju  fdd j	d 	dd	 jd
S )Nr   c                   S   s   dS Nzx must be a 2D tensorr7   r7   r7   r7   r8   rW     rF   z*meta__weight_int4pack_mm.<locals>.<lambda>r,  c                   S   s   dS )Nzw must be a 4D tensorr7   r7   r7   r7   r8   rW     rF   c                      s   d j  S Nr  rv   r7   r   r7   r8   rW     rF   c                      s   d j  S rX  rv   r7   rS  r7   r8   rW     rF   r   rT  rv   
rL   rY   rn   rS   r+  r,  r-  r  r|   r   r  r7   r  rB   r8   meta__weight_int4pack_mm  s    


r^  c                    s   t  dkdd  t   dkdd  t jt jt jt jfv fdd t  jt ju  fdd j	d 	djdS )	Nr   c                   S   s   dS rZ  r7   r7   r7   r7   r8   rW     rF   z2meta__weight_int4pack_mm_for_cpu.<locals>.<lambda>c                   S   s   dS rQ  r7   r7   r7   r7   r8   rW     rF   c                      s   d j  S r[  rv   r7   r   r7   r8   rW     rF   c                      s   d j  S rR  rv   r7   rS  r7   r8   rW     rF   r   rv   )
rL   rY   rn   rS   r+  r,  r-  r  r|   r   r  r7   r]  r8    meta__weight_int4pack_mm_for_cpu  s    


r_  c                    s   t  dkdd  t   dkdd  t jt jt jt jfv fdd t  jt ju  fdd j	d 	djdS )	Nr   c                   S   s   dS rZ  r7   r7   r7   r7   r8   rW     rF   z;_weight_int4pack_mm_with_scales_and_zeros.<locals>.<lambda>c                   S   s   dS rQ  r7   r7   r7   r7   r8   rW     rF   c                      s   d j  S r[  rv   r7   r   r7   r8   rW     rF   c                      s   d j  S rX  rv   r7   rS  r7   r8   rW     rF   r   rv   r\  )rB   r  r  ZqScaleZqZerosr7   r]  r8   )_weight_int4pack_mm_with_scales_and_zeros  s    


r`  )r   rj  r1   c                 C   s   | | d | | S r  r7   ri  r7   r7   r8   kai_roundup  s    ra  c           	         s   | dkr||kr^d}d}d}d
dddd 
fddfd	d
}||||||S |d dkr|| dkrd}d}d}d
ddd  fdd} 	
fdddd  fdd fdd	|||||||S d S )Nr,  rT  r*  r   c                 S   s   t || d}t | |S )Nr,  ra  )r/  krsrZkr_sr_roundedup4r7   r7   r8   kai_k_roundedup  s    z3get_kai_packed_weight_size.<locals>.kai_k_roundedupc                    s8    | ||}|d dks J d||d     S )Nr   r   zk_internal must be evenr7   )r/  nrrc  rd  Z
k_internal)re  kai_num_bytes_biaskai_num_bytes_multiplier_rhskai_num_bytes_sum_rhsr7   r8   9kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0  s    z]get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0c                    s    t | || }| |||| S r2   rb  )r   r/  rf  rc  rd  num_rows)rj  r7   r8   7kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0'  s    z[get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0rU  r   c                    sR   || dksJ | dks J |  dks0J t | || }|||||| S r9  rb  )r   r/  rf  rc  rd  blrk  )kai_bl_multiple_of;kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0kai_nr_multiple_ofr7   r8   9kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0?  s    
z]get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0c                    s^   || dksJ | dks J |  dks0J  }| |}||}|||    S r9  r7   )r/  rf  rc  rd  rm  num_bytes_multiplier_rhsZnum_blocks_per_rowZnum_bytes_per_block)rn  #kai_get_bf16_datatype_size_in_bytesrp  kai_num_blocks_per_rowrg  kai_num_bytes_per_blockri  r7   r8   ro  O  s    
z_get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0c                   S   s   dS )Nr   r7   r7   r7   r7   r8   rs  e  s    zGget_kai_packed_weight_size.<locals>.kai_get_bf16_datatype_size_in_bytesc                    s   |  dksJ t | || S r9  rb  )r/  rm  rn  r7   r8   rt  h  s    z:get_kai_packed_weight_size.<locals>.kai_num_blocks_per_rowc                    s   |   dksJ | d | S )Nr   r   r7   )rm  rr  rv  r7   r8   ru  l  s    z;get_kai_packed_weight_size.<locals>.kai_num_bytes_per_blockr7   )	Zn_bitsrh  KZ	groupsizeZkai_nrZkai_krZkai_srrl  rq  r7   )rn  rs  ro  rj  re  rp  rt  rg  rh  ru  ri  r8   get_kai_packed_weight_size  s<    
rx  )r  c                    s   t  jt ju  fdd t jj r||kr<|jt jksh||k r|d dkr|| dkr|jt jkrt	d|||} j
t|t jdS   |  } j
|t jdS )Nc                      s   d j  S rR  rv   r7   weightsr7   r8   rW   {  rF   z2meta__dyn_quant_pack_4bit_weight.<locals>.<lambda>rU  r   r,  rv   )rL   rY   rS   r  backendsZkleidiaiis_availablerO   r-  rx  r|   r   r{   )rz  Zscales_zerosr  
block_sizein_featuresout_featuresZpacked_weight_sizer7   ry  r8    meta__dyn_quant_pack_4bit_weightu  s.    





	r  c                    sR   t   dkdd  t  jt jfv  fdd  d} j|| jdS )Nr   c                   S   s   dS )Nzinput must be a 2D tensorr7   r7   r7   r7   r8   rW     rF   z-meta__dyn_quant_matmul_4bit.<locals>.<lambda>c                      s   d j  S )Nzexpected input to be f32, got rv   r7   inpr7   r8   rW     rF   r   rv   )rL   rY   rn   rS   r+  r   r|   )r  Zpacked_weightsr}  r~  r  r  r7   r  r8   meta__dyn_quant_matmul_4bit  s    

r  c                    s   t  dkdd  t jt jt jt jfv fdd t   dkdd  t  jt ju  fdd j	d 	djdS )	Nr   c                   S   s   dS rZ  r7   r7   r7   r7   r8   rW     rF   z*meta__weight_int8pack_mm.<locals>.<lambda>c                      s   d j  S r[  rv   r7   r   r7   r8   rW     rF   c                   S   s   dS rQ  r7   r7   r7   r7   r8   rW     rF   c                      s   d j  S )Nzexpected w to be int8, got rv   r7   rS  r7   r8   rW     rF   r   rv   )
rL   rY   rn   rS   r+  r,  r-  r  r|   r   )rB   r  Zq_scalesr7   r]  r8   meta__weight_int8pack_mm  s    


r  c           	         s  t  dkfdd t  dkfdd t ddkfdd t tjdd  t tjdd  t |d	kd
d  t  dv  fdd d}d}jd d }jd d }tt 	||}|
||g |S )Nr   c                      s   d    dS )Nz1cdist only supports at least 2D tensors, X1 got: r  r   r7   )x1r7   r8   rW     rF   z$meta_cdist_forward.<locals>.<lambda>c                      s   d    dS )Nz1cdist only supports at least 2D tensors, X2 got: r  r   r7   )x2r7   r8   rW     rF   r   c                      s   d  d d d S )Nz4X1 and X2 must have the same number of columns. X1: r   z X2: r   r7   )r  r  r7   r8   rW     rF   c                   S   s   dS )Nz=cdist only supports floating-point dtypes, X1 got: {x1.dtype}r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )Nz=cdist only supports floating-point dtypes, X2 got: {x2.dtype}r7   r7   r7   r7   r8   rW     rF   r   c                   S   s   dS )Nz)cdist only supports non-negative p valuesr7   r7   r7   r7   r8   rW     rF   Nr   r   c                      s
   d  S )Nz%possible modes: None, 1, 2, but was: r7   r7   )compute_moder7   r8   rW     rF   rr  )rL   rY   rn   r   rG   is_float_dtyperS   r}   r   broadcast_shapesextendr|   )	r  r  rH  r  r1r2batch_tensor1batch_tensor2r1  r7   )r  r  r  r8   meta_cdist_forward  s@    








r  c                 C   s   |j d }|j d }|j d }|j d d }|j d d }	tt||	}
|
 }|||g t|
}|dks|dks|dks|dkrt|S |t|j kr|	|}tj
|tjdS )Nr   rr  r   r   )r}   r   rL   r  copyr  mathprod
zeros_liker  r   r   )rJ  r  r  rH  ZcdistZc1r  r  r  r  r  Ztensor1_expand_sizeZbatch_productr7   r7   r8   meta_cdist_backward  s    



 

r  c	                    s  t  jt jt jfv  fdd t jt jt jfv fdd t tjfdd d}	|rt |	dkdd  |	d8 }	|	d}
d urt |t	kdd  t j
dkfd	d t    k fd
d fdddd fdd}tdkrl  d}  }|tkr` |	d}n
 d}n||
|}|ttfv s|s d}n
d}|	}jd }|tkr|rt |dkdd  |d8 }|jd }n| }|
|||fS )Nc                      s   d j  S )Nz(expected indices to be long or int, got rv   r7   )r   r7   r8   rW     rF   z$meta_embedding_bag.<locals>.<lambda>c                      s   d j  S )Nz(expected offsets to be long or int, got rv   r7   )r=  r7   r8   rW     rF   c                      s   d j  S )Nz/expected weight to be floating point type, got rv   r7   )r  r7   r8   rW     rF   r   r   c                   S   s   dS Nz1include_last_offset: numBags should be at least 1r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )Nz@embedding_bag: per_sample_weights only supported with mode='sum'r7   r7   r7   r7   r8   rW     rF   c                      s   d j  dS )Nz1expected per_sample_weights to be 1D tensor, got r  r  r7   )per_sample_weightsr7   r8   rW     rF   c                      s   d   d    dS )Nz%expected per_sample_weights.numel() (z$ to be the same as indices.numel() (rb   r{   r7   )r   r  r7   r8   rW     s    c                    s    | ||o| ddkS Nr   r   r   r	  rd  r   padding_idx)is_fast_path_index_selectr7   r8   is_fast_path_index_select_scale  s    z;meta_embedding_bag.<locals>.is_fast_path_index_select_scalec                 S   s<   | j tjks| j tjko:| ddko:|ddko:|dk S Nr   r   )rS   rL   rO   rM   r   )r	  r   r  r7   r7   r8   r  "  s    z5meta_embedding_bag.<locals>.is_fast_path_index_selectc                    s&   |d ur| |||S  | ||S d S r2   r7   r  )r  r  r7   r8   is_fast_path*  s    z(meta_embedding_bag.<locals>.is_fast_pathcpuc                   S   s   dS r  r7   r7   r7   r7   r8   rW   D  rF   )rL   rY   rS   ry   r   rG   r  r   r|   MODE_SUMr   r{   r   MODE_MAX	MODE_MEANr}   )r  r   r=  scale_grad_by_freqr  sparser  Zinclude_last_offsetr  Znum_bagsr   r  
offset2bagbag_sizemax_indicesZfast_path_sumZnumBagsr7   )r   r  r  r=  r  r  r8   meta_embedding_bag  st    










r  c                 G   sB   t | ||g|R  \}}}}t|dkr6|| }||||fS )Nr  )r  r   r|   r   )r  r   r=  rI   r   r  r  r  r7   r7   r8   meta_embedding_bag_forward_onlyM  s    r  c                 C   s.   |r|S | j js| j jr| j S |r(tjS | j S r2   )rS   r   r   rL   ry   )r   rS   promote_int_to_longr7   r7   r8   _get_reduction_dtypeW  s    r  rv   c                C   s6   t | |dd}t| j|}t| ||}| j||dS )NT)r  rv   )r  rG   rM  r}   rN  r|   )r   rn  rP  rS   r  r1  r7   r7   r8   meta_nansumd  s    r  c                 C   s$   t | jtt|  }| |S r2   )rG   ro  r}   rX   r   rn   r|   )r   r1  r7   r7   r8   meta_medianm  s    r  c                 C   sL   t | dkrtd t| j|f}t| ||}| || j|tjdfS )Nr   zmedian CUDA with indices outputrv   )	r   rG   alert_not_deterministicrM  r}   rN  r|   rL   ry   )r   rn   rP  r1  r7   r7   r8   meta_median_mode_dimu  s    
r  c                 C   s   | S r2   r7   r   r7   r7   r8   meta_logical_not_  s    r  c                    s   t t|  kdd  tD ]"\ t dk fdd q$t|   }d| t| j fddttD }| |S )Nc                   S   s   dS )NzZNumber of dimensions of repeat dims can not be smaller than number of dimensions of tensorr7   r7   r7   r7   r8   rW     rF   zmeta_repeat.<locals>.<lambda>r   c                      s   d d  S )Nz"Repeats cannot be negative, found r!  r7   r7   )r   repr7   r8   rW     rF   r  c                    s   g | ]} | |  qS r7   r7   rl  )padded_sizer  r7   r8   rE     rF   zmeta_repeat.<locals>.<listcomp>)	rL   rY   r   rn   r)  rX   r}   r   r|   )r~   r  Znum_new_dimensionsZtarget_sizer7   )r   r  r  r  r8   meta_repeat  s    r  c                 C   s   | S r2   r7   r   r7   r7   r8   
meta_zero_  s    r  c                 C   s   t |tjrt| j|j | S r2   )r\   rL   r   rZ   r}   r~   r   r7   r7   r8   meta_binop_inplace  s    r  c                 C   sf   dd }dd }dd }|| r0||r0t d|| rH||sHt dt|tjrbt| j|j | S )	a*  
    Some checks for inplace ops.
    Checks for promotion rules for some dtypes.
    int.add/sub_(float) and bool.add/sub_(others) are rejected.
    Promoting in these in-place operations would require reallocating
    and copying over elements, hence not allowed.
    Checks for alpha param.
    c                 S   s$   t | trt| jS t | tS d S r2   )r\   r"   rG   r  rS   r   r^   r7   r7   r8   is_integeric  s    
z.meta_binop_inplace_alpha.<locals>.is_integericc                 S   s$   t | trt| jS t | tS d S r2   )r\   r"   rG   r  rS   r   r  r7   r7   r8   
is_floatic  s    
z,meta_binop_inplace_alpha.<locals>.is_floaticc                 S   s$   t | trt| jS t | tS d S r2   )r\   r"   rG   Zis_boolean_dtyperS   r   r  r7   r7   r8   is_booleanic  s    
z.meta_binop_inplace_alpha.<locals>.is_booleanicz]Promotion of int.add/sub_(float) in in-place ops are not possible due to element size change.z_Promotion of book.add/sub_(others) in in-place ops are not possible due to element size change.)r  r\   rL   r   rZ   r}   )r~   r   r  r  r  r  r7   r7   r8   meta_binop_inplace_alpha  s    r  c                 K   s   t | tjdS Nr=   rK   r   rH   )r~   r8  r7   r7   r8   
meta_round  s    r  c                    sj   t tj fdd tt jrJt tj fdd nt tt fdd d S )Nc                      s     dj  S )Nz7: Expected input tensor to have an integral dtype. Got rv   r7   )r  r~   r7   r8   rW     rF   z#shift_dtype_check.<locals>.<lambda>c                      s     dj  S )Nz6: Expected shift value to have an integral dtype. Got rv   r7   r  r[  r7   r8   rW     rF   c                      s     d S )Nz): Expected shift value to be an int. Got r7   r7   r  r7   r8   rW     rF   )rL   rY   rG   r  rS   r\   r   r   r  r~   r[  r7   r  r8   shift_dtype_check  s    

r  c                 C   s   t d| | t| |tjdS )Nrshiftr=   r  rK   r   rH   r  r7   r7   r8   meta_rshifts  s    r  c                 C   s   t d| | t| |tjdS )Nlshiftr=   r  r  r7   r7   r8   meta_lshifts  s    r  c                 C   s   |  | jS r2   r  r   r7   r7   r8   	meta_zero  s    r  c                 C   s   | S r2   r7   r~   r[  r7   r7   r8   
meta_fill_  s    r  c                 C   s
   t | S r2   rb  r  r7   r7   r8   	meta_fill!  s    r  c                 C   s   | S r2   r7   r   r7   r7   r8   
meta_relu_&  s    r  c                 C   s   t | |tjdS r  r  )r~   r   r  r7   r7   r8   meta__add_relu+  s    r        ?UUUUUU?c                 C   s
   t | S r2   rb  r~   Znoiselowerr  r  r   r7   r7   r8   meta_rrelu_with_noise3  s    r  c                 C   s   t | t |fS r2   rb  r  r7   r7   r8    meta_rrelu_with_noise_functional;  s    r  c                 C   s   | S r2   r7   )r~   r  r  r  r   r7   r7   r8   meta_rrelu_with_noise_B  s    r  c                 C   s
   t | S r2   rb  r~   r   r   
accumulater7   r7   r8   meta_index_putI  s    r  c                 C   s   t | j|j | S r2   rZ   r}   )r~   re  valuer7   r7   r8   meta_masked_fill_N  s    r  c                 C   s    |  |  jt| d}|S r   )r|   r   r  rG   r!   )r~   re  rd  Zmasked_scaler7   r7   r8   meta__masked_scaleT  s    r  c                    s@   t |jt jt jfv dd  t  jjk fdd  S )Nc                   S   s   dS )NzMask must be bool or uint8r7   r7   r7   r7   r8   rW   _  rF   z&meta_masked_scatter_.<locals>.<lambda>c                      s   d j  dj  S )NzEmasked_scatter: expected self and source to have same dtypes but got r   rv   r7   r~   r4  r7   r8   rW   c  s   )rL   rY   rS   r  r  )r~   re  r4  r7   r  r8   meta_masked_scatter_\  s    
r  c                 C   s*   t | |\} }tj| tjd}t|||S r   )r)   rL   r   r   r  )r~   re  r4  r   r7   r7   r8   meta_masked_scatteri  s    r  c                 C   s
   |  |S r2   rI  )r~   re  r  r7   r7   r8   meta_masked_scatter_backwardq  s    r  c                 C   s   | S r2   r7   r  r7   r7   r8   meta_index_put_v  s    r  c                 C   s   |  | jS r2   )viewr}   r   r7   r7   r8   
meta_alias{  s    r  c           
         s<  t |  dkdd  t | dkdd  |  }|  |d |d |d } d }||ft  d ko d k fdd |r| jt jks| jt jko|t jk}t || jkp|d	d  |	|}	n
|}	|s8d ur8t  dkd
d  t  kfdd |	S )Nr0   c                   S   s   dS rM  r7   r7   r7   r7   r8   rW     rF   z)common_meta_baddbmm_bmm.<locals>.<lambda>c                   S   s   dS rN  r7   r7   r7   r7   r8   rW     rF   r   r   r   c                	      s&   d d d d  d d  d	S rQ  r7   r7   rS  r7   r8   rW     s   c                   S   s   dS )Nzfout_dtype only supported for torch.float32 output with float16/bfloat16 inputs or same as input dtypesr7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )Nzself must be a 3D tensorr7   r7   r7   r7   r8   rW     rF   c                      s   d  d   S )Nz*Expected an input tensor shape with shape z but got shape: r   r7   )r  self_baddbmmr7   r8   rW     rF   )
rL   rY   rn   r   rS   r,  r-  r+  r|   r  )
rO  rP  Zis_bmmr  r  rZ  Zres_rowsZres_colsZsupported_out_dtyper   r7   )rT  rU  rV  r  r  r8   common_meta_baddbmm_bmm  s>    


r  c                 C   s   t | |dS )NTr  )r~   r  r7   r7   r8   meta_bmm  s    r  c                 C   s   t | |d|dS )NT)r  r  )r~   r  r  r7   r7   r8   meta_bmm_dtype  s    r  c                 C   s<   | | }| | }|dkr8t |dk t |dk kr8|d8 }|S r  )r  )rB   yqr  r7   r7   r8   div_rtn  s
     r  c                 C   sZ   t | | | ||d   d |r(|d nd |d }|rV|d | | | krV|d8 }|S r  )r  )	inputSize
kernelSizer  r  r   rs  r  Z
outputSizer7   r7   r8   pooling_output_shape_pad_lr  s*    

	r  c                    sl   t |dkdd  t dkfdd t d   d d k fdd t| | |S )Nr   c                   S   s   dS )Nzstride should not be zeror7   r7   r7   r7   r8   rW     rF   z&pooling_output_shape.<locals>.<lambda>c                      s
   d  S )Nz'pad must be non-negative, but got pad: r7   r7   padr7   r8   rW     rF   r   r   c                      s   d d d  S )NzApad should be at most half of effective kernel size, but got pad=z, kernel_size=z and dilation=r7   r7   rs  r  r  r7   r8   rW     s
    )rL   rY   r  )r  r  r  r   rs  r  r7   r  r8   r    s    r  c              	      sR     }tdkodkdd  t|dko:|dkdd  t|dkoV|dkdd   ddkoz ddk}|tjkrt|dko|o d	dkd
d  nDt|d	kr̈ ddkr|p|dko|o d	dk fdd td 
kod 	k	
fdd tdko6dkfdd d S )Nr   c                   S   s   dS )NzCkernel size should be greater than zero, but got kH: {kH}, kW: {kW}r7   r7   r7   r7   r8   rW     rF   z$pool2d_shape_check.<locals>.<lambda>c                   S   s   dS )Nz>stride should be greater than zero, but got dH: {dH}, dW: {dW}r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )Nz\dilation should be greater than zero, but got dilationH: {dilationH}, dilationW: {dilationW}r7   r7   r7   r7   r8   rW     rF   r   r   r,  r0   c                   S   s   dS )NzExpected 4D (batch mode) tensor expected for input with channels_last layout with optional 0 dim batch size for input, but got: {input.size()}r7   r7   r7   r7   r8   rW     rF   c                      s   d    S )NzYExpected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got: r   r7   r  r7   r8   rW     rF   c                      s   d d d d  S )NzKpad should be smaller than or equal to half of kernel size, but got padW = z	, padH = z, kW = z, kH = r7   r7   )r  r  r  r  r7   r8   rW     s   c                      s*   d d  d d d d dS NzGiven input size: (rB   z). Calculated output size: (z). Output size is too smallr7   r7   )r  r  r  r  r  r  r7   r8   rW     s   )rn   rL   rY   r   r  )r   r  r  r  r  r  r  	dilationH	dilationWr  r  r  r  r  r   r   Z
valid_dimsr7   )r   r  r  r  r  r  r  r  r  r  r  r8   r    sB    

r  )r   r  r  r  r  r  r  r  pTpHpW	dilationTr  r  r  r  r  r  r  r  r  r  c              
      s  	j }tdko dko dkfdd tdkoLdkoL dk fdd tdkoxdkoxdkfdd t|dv 	fdd t|D ]8|dkrdkrqt	dk	fd	d q|r"t
ko
ko
k
fd
d td koLd koLd kfdd tdkodkodk
fdd d S )Nr   c                      s   d d  d S )Nz5kernel size should be greater than zero, but got kT: z, kH: z, kW: r7   r7   )r  r  r  r7   r8   rW   A  s    z$pool3d_shape_check.<locals>.<lambda>c                      s   d d  d S )Nz0stride should be greater than zero, but got dT: z, dH: z, dW: r7   r7   )r  r  r  r7   r8   rW   H  s    c                      s   d d  d S )Nz9dilation should be greater than zero, but got dilationT: z, dilationH: z, dilationW: r7   r7   )r  r  r  r7   r8   rW   N  s    r  c                      s     dj  S )Nz/: Expected 4D or 5D tensor for input, but got: r   r7   )r  r   r7   r8   rW   V  rF   r<  c                      s     dj  d dS )NzZ: Expected input's non-batch dimensions to have positive length, but input has a shape of z and non-batch dimension z has length zero!)r}   r   r7   )r  r   r   r7   r8   rW   _  s
    c                      s*   d d  d d d d dS )Nzinput image (T: r@  r2  z ) smaller than kernel size (kT:  kH:  kW: rb   r7   r7   )r  r  r  r  r  r  r7   r8   rW   i  s    r   c                      s(   d d d  d d d S )NzHpad should be smaller than or equal to half of kernel size, but got kT: r  r  z padT: z padW: z padH: r7   r7   )r  r  r  r  r  r  r7   r8   rW   q  s    r   c                      s6   d d d  d d d d d dS r  r7   r7   )r  r  r  r  r  r  r  r7   r8   rW   y  s    )r   rL   rY   r   r   )r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r7   )r  r  r  r  r  r  r  r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r8   r  %  sJ    	&r  c                 C   s   | j }t| |||||||	|
|||||||||||| t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | d S )Nr,  r0   r   r   r   r  r  )r   r$  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r7   r7   r8   max_pool3d_backward_shape_check  s@    r  )r   r$  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  c                 C   s   | j }t| ||||||||	|
|ddd|||||||d t|||d | t|||d | t|||d | t|||d | d S )Nr   Tr,  r0   r   r  )r   r$  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r7   r7   r8   r    s:    r  c                 C   sB  dd }|d|\}}t t|dv dd  t|dkrF|| }	}
n|d|\}	}
|d	|\}}|d
|\}}| d}| d}| d}t| }|t jkrt |  dkdd  n4|t jkrt |  dv dd  nt ddd  t	||||	||}t	||||
||}t
| |||	|
|||||||||| |||fS )Nc                    sD   t t|dv  fdd |d }t|dkr4|n|d }||fS )Nr  c                      s   d  dS )Nzmax_pool2d: r  r7   r7   r  r7   r8   rW     rF   zEmax_pool2d_checks_and_compute_shape.<locals>.unpack.<locals>.<lambda>r   r   r  r  r7   r  r8   r    s    

z3max_pool2d_checks_and_compute_shape.<locals>.unpackr|  r  c                   S   s   dS )NzOmax_pool2d: stride must either be omitted, a single int, or a tuple of two intsr7   r7   r7   r7   r8   rW     rF   z5max_pool2d_checks_and_compute_shape.<locals>.<lambda>r   r   r  rs  r  rr  r   r,  c                   S   s   dS )NzMnon-empty 4D (batch mode) tensor expected for input with channels_last layoutr7   r7   r7   r7   r8   rW     rF   r  c                   S   s   dS )Nz9non-empty 3D or 4D (batch mode) tensor expected for inputr7   r7   r7   r7   r8   rW   !  rF   Fc                   S   s   dS )Nz?Unsupport memory format. Supports only ChannelsLast, Contiguousr7   r7   r7   r7   r8   rW   &  rF   )rL   rY   r   r   rG   r!   r  rn   r   r  r  )r   r|  r   r  rs  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r7   r7   r8   r    sb    		








r  c                    s   t |||||\}tj jk fdd |jfdd}	|	  |	| t}
tjjjj	|
dS )Nc                      s   dj  d j  S )NzExpected dtype z  for `gradOutput` but got dtype rv   r7   r  r7   r8   rW   V  rF   z7meta_max_pool2d_with_indices_backward.<locals>.<lambda>c                    s:   t | d   t | d  t | d  d S )Nr0   r   r   )r  )r~  )r  r   r  r  r7   r8   _check_dim_size\  s    z>meta_max_pool2d_with_indices_backward.<locals>._check_dim_sizerB  )
r  rL   rY   rS   r   rG   r!   rs   r}   rk   )r$  r~   r|  r   r  rs  r  r   r  r  r   r7   )r$  r  r   r  r  r~   r8   %meta_max_pool2d_with_indices_backwardA  s.    

r  c                 C   s   t | |||||\}}}|  dkr.| dnd}	t| }
|  dkrT|||g}n|	|||g}tj|| j| j|
dtj|tj	| j|
dfS r  )
r  rn   r   rG   r!   rL   rs   rS   rk   r   r  r7   r7   r8   meta_max_pool2d_with_indicesm  s2    
r  c           
   	      s   t jdv fdd j}t|d |D ], t  dkd  d  d q.t td	kd
d  t t|d	kdd  d}dd|dkrd}nd}t jjkdd  t jdkfdd d}d}d	 t ||kd t ||kdd  t  d	k fdd t |d d  d kfdd t |d d  d kfdd  dkr|||d |d g}	n||d |d g}	t j|	jj	dt j|	t j
j	dfS )Nr  c                      s   d j  S )Nz:fractional_max_pool2d: Expected 3D or 4D tensor, but got: r  r7   r   r7   r8   rW     rF   z,meta_fractional_max_pool2d.<locals>.<lambda>r0   r   z^fractional_max_pool2d: Expected input to have non-zero  size for non-batch dimenions, but got r   z emptyr   c                   S   s   dS )NzNfractional_max_pool2d: kernel_size musteither be a single int or tuple of Intsr7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )NzOfractional_max_pool2d: output_size must either be a single int or tuple of Intsr7   r7   r7   r7   r8   rW     rF   r  rr  r   r,  r   c                   S   s   dS )Nz6Expect _random_samples to have the same dtype as inputr7   r7   r7   r7   r8   rW     rF   c                      s   d j  S )Nz1Expect _random samples to have 3 dimensions got, r  r7   )random_samplesr7   r8   rW     rF   z=Expect _random_samples.size(0) no less then input batch size.c                   S   s   dS )Nz<Expect _random_samples.size(1) equals to input channel size.r7   r7   r7   r7   r8   rW     rF   c                      s   d  dS )Nz/Expect _random_samples.size(2) equals to 2 got .r7   r7   )r   r7   r8   rW     rF   c                      s   dd  d  S )Nz%fractional_max_pool2d: kernel height r   z' is too large relative to input height r7   r7   )input_heightr|  r7   r8   rW     rF   c                      s   dd  d  S )Nz$fractional_max_pool2d: kernel width r   z& is too large relative to input width r7   r7   )input_widthr|  r7   r8   rW     rF   r  )rL   rY   r   r   r   r   rS   rn   rs   rk   r   )
r~   r|  r  r   r   Zinput_channelsZinput_batchr   cr   r7   )r   r  r  r|  r   r~   r8   meta_fractional_max_pool2d  s    










r  c                 C   s  t t|dv dd  |d }t|dkr0|n|d }t|dkrH|n|d }t | pdt|dv dd  |sv|n|d }	|s|nt|dkr|	n|d }
|s|nt|dkr|	n|d }t t|dv dd  |d }t|dkr|n|d }t|dkr|n|d }t t|dv d	d  |d }t|dkrB|n|d }t|dkr\|n|d }t | jd
v dd  | jdkr| dnd}| d}| d}| d}| d}t||||	||}t||||
||}t||||||}t| |||||	|
|||||||||||||d | jdko<t| t j	k}| jdkr|| 
d}|  ol|jt j	d}||||f}n|||||f}| |}| j|t jd}|r|jt j	d}|jt j	d}||fS )Nr  c                   S   s   dS NzMmax_pool3d: kernel_size must either be a single int, or a tuple of three intsr7   r7   r7   r7   r8   rW     rF   z.meta_max_pool3d_with_indices.<locals>.<lambda>r   r   r   c                   S   s   dS NzQmax_pool3d: stride must either be omitted, a single int, or a tuple of three intsr7   r7   r7   r7   r8   rW     rF   c                   S   s   dS NzImax_pool3d: padding must either be a single int, or a tuple of three intsr7   r7   r7   r7   r8   rW     rF   c                   S   s   dS NzJmax_pool3d: dilation must be either a single int, or a tuple of three intsr7   r7   r7   r7   r8   rW     rF   r  c                   S   s   dS r  r7   r7   r7   r7   r8   rW     rF   r<  r  r  rr  r   zmax_pool3d_with_indices()r,  r   rv   )rL   rY   r   r   r   r  r  rG   r!   r  r   r   r|   r   r  )r   r|  r   r  rs  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  input_channels_last_checkr   r   r   r7   r7   r8   meta_max_pool3d_with_indices  s    

  







r  c                 C   sd  t t|dv dd  |d }t|dkr0|n|d }	t|dkrH|n|d }
t | pdt|dv dd  |sv|n|d }|s|	nt|dkr|n|d }|s|
nt|dkr|n|d }t t|dv dd  |d }t|dkr|n|d }t|dkr|n|d }t t|dv d	d  |d }t|dkrB|n|d }t|dkr\|n|d }t |jd
v dd  |d}|d}|d}|d}| d}| d}| d}t|| ||||	|
|||||||||||||||d |jdkot|t jk}|jdkr@|	d}|
  o>|j
t jd}||j}|r`|jt jd}|S )Nr  c                   S   s   dS r  r7   r7   r7   r7   r8   rW   `  rF   z7meta_max_pool3d_with_indices_backward.<locals>.<lambda>r   r   r   c                   S   s   dS r  r7   r7   r7   r7   r8   rW   h  rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW   p  rF   c                   S   s   dS r	  r7   r7   r7   r7   r8   rW   x  rF   r  c                   S   s   dS r  r7   r7   r7   r7   r8   rW     rF   r  r  rr  r   z"max_pool3d_with_indices_backward()r<  r,  r   )rL   rY   r   r   r   r  rG   r!   r  r   r   r|   r}   r  )r$  r   r|  r   r  rs  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r'  r7   r7   r8   %meta_max_pool3d_with_indices_backwardR  s    
  









r  r   gridc                    s   t j jk fdd t jt jko8 jt jk fdd t jd  jd k fdd t  jd jd k fdd tdjD ]$t j dkfd	d qd S )
Nc                      s   dj  d j  S )NzNgrid_sampler(): expected input and grid to be on same device, but input is on z and grid is on ra  r7   r  r   r7   r8   rW     s
    z+check_grid_sampler_common.<locals>.<lambda>c                      s   dj  d j  S )NzTgrid_sampler(): expected input and grid to have torch.strided layout, but input has z and grid has )rj   r7   r  r7   r8   rW     s
    r   c                      s   dj  d j  S )NzZgrid_sampler(): expected grid and input to have same batch size, but got input with sizes  and grid with sizes r   r7   r  r7   r8   rW     s
    r   r   c                      s   dj d  d j S )Nz+grid_sampler(): expected grid to have size r   z, in last dimension, but got grid with sizes )r   r}   r7   r  r7   r8   rW     s    c                      s   dj  d  dS )NzYgrid_sampler(): expected input to have non-empty spatial dimensions, but input has sizes r   r  r   r7   r
  r7   r8   rW     s
    )rL   rY   rk   rj   r  r}   r   r   r  r7   )r  r   r   r8   check_grid_sampler_common  s*    
r  c                   @   s   e Zd ZdZdZdZdS )GridSamplerInterpolationr   r   r   N)rd   
__module____qualname__ZBILINEARZNEARESTBICUBICr7   r7   r7   r8   r    s   r  r   r  interpolation_modec                    sP   t jdkoj jk fdd t jdko@|tjjk dd  d S )Nr<  c                      s   dj  d j  S )Nzdgrid_sampler(): expected 5D input and grid with same number of dimensions, but got input with sizes r  r   r7   r  r7   r8   rW     s
    z'check_grid_sampler_3d.<locals>.<lambda>c                   S   s   dS )Nz<grid_sampler(): bicubic interpolation only supports 4D inputr7   r7   r7   r7   r8   rW     rF   )rL   rY   r   r  r  r  r  r7   r  r8   check_grid_sampler_3d  s    

r  c           
      C   s:   |d }|rt j|t jd}nd }t j|t jd}	||	fS Nr   r   )rL   r  r   r   
r$  r   r  r  padding_modealign_cornersr4  Zinput_requires_gradr'  	grad_gridr7   r7   r8   grid_sampler_2d_backward_meta  s    
r  c           
      C   s\   t | | t| || | jd }| jd }|jd }|jd }|jd }	| |||||	fS )Nr   r   r   r0   )r  r  r}   r|   )
r   r  r  r  r  rh  CZout_DZout_HZout_Wr7   r7   r8   grid_sampler_3d  s    	





r!  r  c           
      C   sP   t || t||| |d }|r4tj|tjd}nd }tj|tjd}	||	fS r  )r  r  rL   r  r  r   r  r7   r7   r8   grid_sampler_3d_backward  s    
r"  c                 O   s:   | dd }|st|}||d< tj| g|R i |S )NrS   )rR   rG   Z	get_dtyperL   rs   )r   r  rI   r8  rS   r7   r7   r8   full7  s
    
r#  c                 C   s   |t jkrt |d u dd  t jd|d u r2| jn|||d u rD| jn||d}| jrp||  | 	 | 
  n||  |  d |d |S tjj| |||||d}|d |S )Nc                   S   s   dS )Nz9memory format option is only supported by strided tensorsr7   r7   r7   r7   r8   rW   M  rF   zzeros_like.<locals>.<lambda>r   r   Tr`  )rL   Z
sparse_coorY   rs   rS   rk   	is_sparseZsparse_resize_and_clear_r   
sparse_dim	dense_dimrn   Z_coalesced_r.   r   r  fill_)r~   rS   rj   rk   rl   r   r  r7   r7   r8   r  A  s:    	

	
r  ri   c                C   sB   |d u rt  }|d u r t  }|d u r.t j}t j| ||||dS r   rL   rp   Zget_default_devicer  rs   r   rS   rj   rk   rl   rm   r7   r7   r8   	meta_onesn  s    
r*  c                C   sB   |d u rt  }|d u r t  }|d u r.t j}t j| ||||dS r   r(  r)  r7   r7   r8   
meta_zeros  s    
r+  c           	         s   ddl m}  }t|dkdd   dkr4 n |   }t| |kpb||k  fdd dkrn| t }t } |    }| = | = 	|||S )Nr   r'  c                   S   s   dS )Nz-select() cannot be applied to a 0-dim tensor.r7   r7   r7   r7   r8   rW     rF   zmeta_select.<locals>.<lambda>c                      s   d d   d  S )Nzselect(): index z! out of range for tensor of size z at dimension r   r7   rn   rx   r~   r7   r8   rW     s   )
r   r(  rn   rL   rz   r   r   r   r   r   )	r~   rn   rx   r(  r   r   Znew_sizer   Znew_storage_offsetr7   r,  r8   meta_select  s(    
r-  c                 C   s
   t | S r2   rG   Zclone_preserve_strides)r~   r	  rn   rx   r7   r7   r8   meta_select_scatter  s    r/  c                 C   s
   t | S r2   r.  )r~   r	  rn   rf   re   stepr7   r7   r8   meta_slice_scatter  s    r1  )rn   dim_post_exprwrap_scalarc                 C   sb   |dkr|sJ d}| }|d }| |k s2| |krNJ d|  d| d| d| dk r^| |7 } | S )Nr   r   zdim z out of bounds (ra   rb   r7   )rn   r2  r3  r   r   r7   r7   r8   r     s    ,r   c                 C   s   |   dkrdS | j| S r  r  )r~  rn   r7   r7   r8   ensure_nonempty_size  s    r4  c                    st   t  d}t  d}t||kdd  t|D ]6 kr8tttk fdd q8d S )Nr   c                   S   s   dS )NzDIndex tensor must have the same number of dimensions as input tensorr7   r7   r7   r7   r8   rW     rF   z$gather_shape_check.<locals>.<lambda>c                      s$   d dj  dj  d   S )Nz!Size does not match at dimension z expected index  to be no larger than self  apart from dimension r   r7   rn   r   rx   r~   r7   r8   rW     s   )r   rn   rL   rY   r   r4  )r~   rn   rx   	self_dimsZ
index_dimsr7   r7  r8   gather_shape_check  s    r9  c                    sn   ddl m} t||  }|  dk}|sbt jtjkpH jtj	k fdd t
| |  |  jS )Nr   r'  c                      s   d j  S )Nz8gather(): Expected dtype int32/int64 for index, but got rv   r7   rw   r7   r8   rW     rF   zmeta_gather.<locals>.<lambda>)r   r(  r   rn   r{   rL   rY   rS   ry   r   r9  r|   r}   )r~   rn   rx   Zsparse_gradr(  wrapped_dimZis_index_emptyr7   rw   r8   meta_gather  s    
r;  c                 C   s   |rT| dkrdS | dkrdS | dkr(dS | dkr4dS | d	kr@d
S t ddd  d S | dkr`dS | dkrldS t ddd  d S d S )Nr  Z
REDUCE_ADDr  ZREDUCE_MULTIPLYmeanZREDUCE_MEANZamaxZREDUCE_MAXIMUMZaminZREDUCE_MINIMUMFc                   S   s   dS )Nz=reduce argument must be either sum, prod, mean, amax or amin.r7   r7   r7   r7   r8   rW     rF   z#get_operator_enum.<locals>.<lambda>addmultiplyc                   S   s   dS )Nz/reduce argument must be either add or multiply.r7   r7   r7   r7   r8   rW     rF   r  )reduce_use_new_optionsr7   r7   r8   get_operator_enum  s,    rA  c                    sl   ddl m} || dkrDt|jtjkp6|jtjk fdd |d urht|j|jk fdd d S )Nr   r'  c                      s
     dS )Nz((): Expected dtype int32/int64 for indexr7   r7   method_namer7   r8   rW     rF   z,scatter_gather_dtype_check.<locals>.<lambda>c                      s
     dS )Nz0(): Expected self.dtype to be equal to src.dtyper7   r7   rB  r7   r8   rW     rF   )r   r(  r{   rL   rY   rS   ry   r   )rC  r~   rx   src_optr(  r7   rB  r8   scatter_gather_dtype_check  s    


rE  c                 C   s
   t | dS r  )r   r   r7   r7   r8   ensure_nonempty_dim"  s    rF  c           	         s0  ddl m} | dkr d S tt t kdd  d}t }t|D ].}t|}| krtq\|t|kr\d} qq\|sƈd urt|D ]$}t|}|t|krd} qqd urtt t kdd  t|  fdd nt|  fd	d d S )
Nr   r'  c                   S   s   dS NzCIndex tensor must have the same number of dimensions as self tensorr7   r7   r7   r7   r8   rW   .  rF   z%scatter_shape_check.<locals>.<lambda>FTc                   S   s   dS rG  r7   r7   r7   r7   r8   rW   H  rF   c                      s&   dj  dj  d  dj   S )NExpected index r5  r6  z and to be no larger than src r   r7   rn   rx   r~   rD  r7   r8   rW   L  s   c                      s   dj  dj  d   S )NrH  r5  r6  r   r7   r,  r7   r8   rW   R  s   )	r   r(  r{   rL   rY   rF  rn   r   r4  )	r~   rn   rx   rD  r(  Zis_wrong_shaper8  r   Zindex_d_sizer7   rI  r8   scatter_shape_check'  sF    


rJ  c                 C   s@   t ||  }td| || t| ||| |d ur<t|| d S )Nscatter)r   rn   rE  rJ  rA  )r~   rn   rx   r	  r?  r@  r:  r7   r7   r8   scatter_meta_implX  s
    rL  c                 C   s   t | |||d | | jS Nr=  rL  r|   r}   r~   rn   rx   r	  r7   r7   r8   meta_scatter_adda  s    rP  c                 C   s   t | |||d | S rM  rL  rO  r7   r7   r8   meta_scatter_add_g  s    rR  c                 C   s0   t |tjr|nd }t| |||| | | jS r2   )r\   rL   r   rL  r|   r}   r~   rn   rx   Zsrc_or_valuer   r	  r7   r7   r8   meta_scatterm  s    
rT  c                 C   s(   t |tjr|nd }t| |||| | S r2   )r\   rL   r   rL  rS  r7   r7   r8   meta_scatter_|  s    	rU          )queryr   r  	dropout_p	is_causalreturn_debug_maskrd  c              	   C   sN  |  d}|  d}|  d}	|  d}
| d}| dd}t|dd}tj|||	ftj| jd}|r|
dkrzdnd}t|	| }|dkrd}n|dkrd}tj|||	|f| j	| jd}ntjd| j	| jd}tj
jrtj rtjd	tjd
d}tjd	tjd
d}n$tjdtjd
d}tjd	tjd
d}||d d |	||||f	S )Nr   r   r   r0   r  @         r7   rh   )r   r  rL   r   rs   rO   rk   r  ceilrS   versionhipr   r|  ry   r  )rW  r   r  rX  rY  rZ  rd  r   	num_headsmax_seqlen_batch_qhead_dimmax_seqlen_batch_kquery_t	attention	logsumexpblocksize_cmax_seqlen_k
debug_maskseedoffsetr7   r7   r8   (meta__scaled_dot_product_flash_attention  sP    






rm  .)rW  	res_shapec                    s   t jkr.dd}t|dd}n`tg dfdddd fdd	 D } fd
d	tt D }tj|j	j
d|}|S )Nr   r   )r   r   r   r0   c                    s      |  S r2   r  )idx)rW  r7   r8   rW     rF   z,alloc_with_matching_layout.<locals>.<lambda>Tr   c                    s   g | ]} | qS r7   r7   )rA   ro  )rn  r7   r8   rE     rF   z.alloc_with_matching_layout.<locals>.<listcomp>c                    s   g | ]}  |qS r7   rw   rl  )	dim_orderr7   r8   rE     rF   r  )rX   r}   r  rL   r   sortedr   r   rs   rS   rk   r   )rW  rn  re  r  Zpermuted_shapeZfinal_permuter7   )rp  rW  rn  r8   alloc_with_matching_layout  s    
rr  )	rW  r   r  	attn_biascompute_log_sumexprX  rY  rZ  rd  c	              	   C   s   |  d}	|  d}
|  d}| d}| d}|	|
||f}t| |}tj|	|
|ftj| jd}tjdtjdd}tjdtjdd}||d d ||||d f	S Nr   r   r   r   r  r7   rh   r   rr  rL   rs   rO   rk   ry   )rW  r   r  rs  rt  rX  rY  rZ  rd  r  r  S_QS_KVD_Vrn  r  
logsum_exprk  rl  r7   r7   r8   (meta__scaled_dot_product_cudnn_attention  s0    





r{  )rW  r   r  rs  rX  rY  rZ  rd  c              	   C   s   |  d}|  d}	|  d}
| d}| d}||	|
|f}t| |}tj||	|
ftj| jd}tjdtjdd}tjdtjdd}||d d |
|||d f	S ru  rv  )rW  r   r  rs  rX  rY  rZ  rd  r  ZH_Qrw  rx  ry  rn  r  rz  rk  rl  r7   r7   r8   5meta__scaled_dot_product_fused_attention_overrideable  s0    





r|  )r  rW  r   r  r   rg  	cum_seq_q	cum_seq_kmax_qmax_krX  rY  philox_seedphilox_offsetrd  c                 C   sX   t |dddd}t |dddd}t |dddd}|||fS r  )rL   r   r  )r  rW  r   r  r   rg  r}  r~  r  r  rX  rY  r  r  rd  grad_qgrad_kgrad_vr7   r7   r8   'meta__scaled_dot_product_flash_backward6  s    r  )rW  r   r  rX  rY  	attn_maskrd  c                 C   sR   |  d}|  d}|  d}	t| }
tj||	|ftj| jddd}|
|fS )Nr   r   r   r  )r   rL   r   rs   rO   rk   r  )rW  r   r  rX  rY  r  rd  r   ra  rb  rf  rg  r7   r7   r8   0meta__scaled_dot_product_flash_attention_for_cpuR  s"    




r  )
r  rW  r   r  r   rg  rX  rY  r  rd  c
                 C   s   | d}
| d}| d}| d}| d}tj|
|||fd|j|jd}tj|
|||fd|j|jd}tj|
|||fd|j|jd}|||fS )Nr   r   r0   r   r   r   r   r0   r  )r   rL   empty_permutedrS   rk   )r  rW  r   r  r   rg  rX  rY  r  rd  r   ra  rc  len_qZlen_kr  r  r  r7   r7   r8   9meta__scaled_dot_product_flash_attention_for_cpu_backwardt  s0    







r  )rW  r   r  rs  rt  rY  rd  c                 C   s   |  dd} | dd}| dd}| d}| d}	| d}
|d}tj||	|
|| j| jd}tjjrtj	 r|r|	nd}n|rt
|	d d nd}tj||
|ftj| jd}| dd}tjdtjd	d}tjdtjd	d}||||fS )
Nr   r   r   rr  r   r  rU  r7   rh   )r  r   rL   rs   rS   rk   r_  r`  r   r|  r  r^  rO   ry   )rW  r   r  rs  rt  rX  rY  rd  r  r  ra  Kvr  logsumexp_dimrz  rk  rl  r7   r7   r8   ,meta__scaled_dot_product_efficient_attention  s(    



r  )r  rW  r   r  rs  r   rg  r  r  rX  grad_input_maskrY  rd  c                 C   s  | d}| d}| d}| d}| d}| d}tj||||fd|j|jd}tj||||fd|j|jd}tj||||fd|j|jd}d }|d ur|
d r| d}|d dkr|n|d |d  }t|  }||d< tj||j|jd}|d	d |f }||||fS )
Nr   r   r   r0   r  r  r   r*  .)r   rL   r  rS   rk   r   rs   )r  rW  r   r  rs  r   rg  r  r  rX  r  rY  rd  r   ra  r  rc  Z
head_dim_vr  r  r  r  	grad_biaslastDimlastDimAligned	new_sizesr7   r7   r8   +meta__scaled_dot_product_efficient_backward  sF    









 
r  )r  rW  r   r  r   rg  r  r  rs  r}  r~  r  r  rX  rY  rd  c                 C   s(   t |}t |}t |}|||fS r2   rb  )r  rW  r   r  r   rg  r  r  rs  r}  r~  r  r  rX  rY  rd  r  r  r  r7   r7   r8   'meta__scaled_dot_product_cudnn_backward  s    


r  )rW  r   r  r}  r~  r  r  rX  rY  rZ  rd  window_size_leftwindow_size_right	seqused_kalibi_slopesc                 C   s  |d u r|  dn
| d }|d u r0|  dn|}|d u rF| dn|}|  d}|  d}t| }|d u rtj|||ftj| jd}n"|  d}tj||ftj| jd}|	r
|dkrdnd}t|| }|dkrd}n|dkrd}tj||||f| j	| jd}ntjd| j	| jd}d	\}}tj
jrbtj rbtjd
tjdd}tjd
tjdd}n$tjdtjdd}tjd
tjdd}|||||fS )Nr   r   rr  r   r  r[  r\  r]  )NNr7   rh   r   )r   r{   rL   r   rs   rO   rk   r  r^  rS   r_  r`  r   r|  ry   r  )rW  r   r  r}  r~  r  r  rX  rY  rZ  rd  r  r  r  r  r   rb  rd  ra  rc  rf  rg  Ztotal_qrh  ri  rj  rk  rl  r7   r7   r8   meta__flash_attention_forward)  sR    




r  )r  rW  r   r  r   rg  r}  r~  r  r  rX  rY  r  r  rd  r  r  c                 C   s(   t |}t |}t |}|||fS r2   rb  )r  rW  r   r  r   rg  r}  r~  r  r  rX  rY  r  r  rd  r  r  
grad_querygrad_key
grad_valuer7   r7   r8   meta__flash_attention_backwardy  s    


r  )rW  r   r  r  cu_seqlens_qcu_seqlens_kmax_seqlen_qri  rX  custom_mask_typert  rd  causal_diagonalseqlen_kwindow_sizec                 C   s   |  d}|  d}| d}|  d}| d}tj||||| j| jd}|d urb| dd n|}|}|d ur|d us~J |}|d ur|n|}|
rt|d d nd}tj|||ftj| jd}tjdtjdd}tjdtjdd}||||||fS )	Nr   r   rr  r   r  rU  r7   rh   )	r   rL   rs   rS   rk   r  r^  rO   ry   )rW  r   r  r  r  r  r  ri  rX  r  rt  rd  r  r  r  r  r  rh  ra  r  r  Zlogsumexp_batch_dimZactual_max_seqlen_qZactual_max_seqlen_kr  rz  rk  rl  r7   r7   r8   !meta__efficient_attention_forward  s,    




r  )r  rW  r   r  r  r  r  r  ri  rg  rX  r  r  r  bias_requires_gradrd  num_splits_keyshared_storage_dqdkdvc                 C   sN  |rt |jd |jd kdd  t |jd |jd kdd  t jg |jdd d|jd |jd R |j|jd	}|d
d}|d
d}|d
d}nt |}t |}t |}|d ur2|d}|d dkr|n|d |d  }t	| }||d< t j||j|jd	}|dd |f }nt jd|jd}||||fS )Nr   c                   S   s   dS )Nz,seqlen must match for `shared_storage_dqdkdvr7   r7   r7   r7   r8   rW     rF   z4meta__efficient_attention_backward.<locals>.<lambda>r0   c                   S   s   dS )Nz3embedding dim must match for `shared_storage_dqdkdvr7   r7   r7   r7   r8   rW     rF   r   rr  r   r  r  r   r*  .r7   ra  )
rL   rY   r}   rs   rS   rk   r*  r   r   r   )r  rW  r   r  r  r  r  r  ri  rg  rX  r  r  r  r  rd  r  r  chunkr  r  r  r  r  r  r  r7   r7   r8   "meta__efficient_attention_backward  s:    *




 r  )r~   r  scale_ascale_br  scale_resultr  use_fast_accumc                    s~  dd }t  dko" dkfdd t |joH|jfdd tdkrJdd	 }	d
d }
dd }t |	 p|fdd t |
 p|fdd t dd dkfdd t dd dkodd dkfdd j\}djt jkrNjt jkpfjt j	kofjt j	k}
 dkr
 dkrt jt jkojt jkdd  n|r~jt j	krd}|d }nd}d}dd }|||}||dd }||| |  ||| | 
  krb
 krbt  dd  t  dd  nt d fdd nt jt jkojt jkdd  t  dko dkfd d dkr0ddkr0ddkr0dkr0t  o$ d!d  nt dfd"d |d urX|nj}t jdd|jd#S )$Nc                 S   s   | t jt jt jt jt jfv S r2   )rL   r.  Zfloat8_e5m2Zfloat8_e4m3fnuzZfloat8_e5m2fnuzZfloat4_e2m1fn_x2rv   r7   r7   r8   is_fp8_or_fp4_type  s    z*meta_scaled_mm.<locals>.is_fp8_or_fp4_typer   c                      s   d   d    S )Nz%Inputs must be 2D but got self.dim()=z and mat2.dim()=r   r7   r  r~   r7   r8   rW     rF   z meta_scaled_mm.<locals>.<lambda>c                      s   dj  d j  S )Nz?Expected both inputs to be fp8 or fp4 types but got self.dtype=z and mat2.dtype=rv   r7   r  r7   r8   rW   "  rF   r   c                 S   s   | d | d ko| d dkS r  r7   r  r7   r7   r8   is_row_major'  s    z$meta_scaled_mm.<locals>.is_row_majorc                 S   s   | d dko| d dkS r  r7   r  r7   r7   r8   is_col_major*  s    z$meta_scaled_mm.<locals>.is_col_majorc                 S   s   |  ddkp|  ddkS r  r   )Z	tensor_2dr7   r7   r8   has_zero_dim-  s    z$meta_scaled_mm.<locals>.has_zero_dimc                      s   d    S )Nz#self must be row_major, got stride r  r7   r   r7   r8   rW   2  rF   c                      s   d    S )Nz#mat2 must be col_major, got stride r  r7   r  r7   r8   rW   6  rF   r   r*  r   c                      s   d  d S )NzBExpected self.size(1) to be divisible by 16, but got self.size(1)=r   r   r7   r   r7   r8   rW   :  rF   c                      s   d j  S )Nz>Expected both dimensions of mat2 to be divisble by 16 but got r   r7   r  r7   r8   rW   >  rF   c                   S   s   dS )NzNFor tensorwise scaling, both scale_a and scale_b must be float (fp32) tensors.r7   r7   r7   r7   r8   rW   R  rF   rU  r\  c                 S   s   | | d | S r  r7   ri  r7   r7   r8   ceil_divb  s    z meta_scaled_mm.<locals>.ceil_divr,  c                   S   s   dS )Nzscale_a must be contiguousr7   r7   r7   r7   r8   rW   u  rF   c                   S   s   dS )Nzscale_b must be contiguousr7   r7   r7   r7   r8   rW   y  rF   Fc                	      s&   d  d   d d   d	S )NzTInvalid blockwise scaling configuration. For blockwise scaling, scale_a should have  elements, got z, scale_b should have r  r  r7   )expected_a_sizeexpected_b_sizer  r  r7   r8   rW   ~  s    c                   S   s   dS )NzKFor rowwise scaling, both scale_a and scale_b must be float (fp32) tensors.r7   r7   r7   r7   r8   rW     rF   c                      s   d   d  S )NzLFor non-tensorwise scaling, scale tensors must be 2D, but got scale_a.dim()=z and scale_b.dim()=r   r7   )r  r  r7   r8   rW     rF   c                   S   s   dS )Nz@Both scale_a and scale_b must be contiguous for rowwise scaling.r7   r7   r7   r7   r8   rW     rF   c                      sB   d  d d d d d d d d d dS )	Nz}Invalid scaling configuration. For tensorwise scaling, both scales should be scalar. For rowwise scaling, scale_a should be (z, 1), scale_b should be (1, z). Got scale_a.size()=(r   ra   r   z) and scale_b.size()=(rb   r   r7   )r0  r   r  r  r7   r8   rW     s    r  )rL   rY   rn   rS   r   r   r   r}   Zfloat8_e8m0fnur.  r{   r+  r   rs   rk   )r~   r  r  r  r  r  r  r  r  r  r  r  Z_kZis_blockwise_scalingZblock_size_kZblock_size_mnr  Znum_k_blocksZpadded_num_k_blocksZ
_out_dtyper7   )r  r  r0  r  r   r  r  r~   r8   meta_scaled_mm  s    	


$








	r  c                 C   s    t | ||||dd | | jS NT)r@  rN  r~   rn   rx   r	  r   r3  r7   r7   r8   meta_scatter_reduce_two  s    r  c                 C   s   t | ||||dd | S r  rQ  r  r7   r7   r8   meta_scatter_reduce__two  s    r  c                   sh   t d    k odkn   fdd   dkrLt j|t j jdS t j d|t j jdS )Nr   r   c                      s   d    S )Nz@The probabilty distributions dimensions must be 1 or 2, but got r   r7   r  r7   r8   rW     rF   z"meta_multinomial.<locals>.<lambda>r   r  )rL   rY   rn   rs   ry   rk   r   )r   num_samplesreplacementr   r7   r  r8   meta_multinomial  s    
r  c                 C   s   d}| D ]}||9 }q|S r  r7   )vsr  vr7   r7   r8   multiply_integers  s    
r  c                    s   t tkfdd d  t t k fdd t tdd dd  D ortdd D fdd d d \}}||gR S )Nc                      s   d  dt  S )Nz%It is expected output_size equals to , but got size r  r7   )num_spatial_dimsr  r7   r8   rW     rF   z'upsample_common_check.<locals>.<lambda>r   c                      s   d  dt  S )Nz$It is expected input_size equals to r  r  r7   )expected_input_dimsr  r7   r8   rW     rF   c                 s   s   | ]}|d kV  qdS rz  r7   r   r7   r7   r8   r_     rF   z(upsample_common_check.<locals>.<genexpr>c                      s   d  d S )NzDInput and output sizes should be greater than 0, but got input size z and output size r7   r7   )r  r  r7   r8   rW     s   )rL   rY   r   r  )r  r  r  r  Zchannelsr7   )r  r  r  r  r8   upsample_common_check  s    

*r  c                    sZ   t   dkp"t  dd   fdd t  |dd} |jt	 dS )Nr   r   c                      s   d    S )Nz>Non-empty 3D data tensor expected but got a tensor with sizes r   r7   r  r7   r8   rW     rF   z$upsample_nearest1d.<locals>.<lambda>r  r   
rL   rY   r{   r  r   r  r|   r  rG   r!   )r   r  Zscalesfull_output_sizer7   r  r8   upsample_nearest1d  s    


r  c           	         s   t   dkp"t  dd   fdd t  |dd} |}t } j	\}}}} j
jdkr~|dk r~t j}|j|d	}|S )
Nr   r   c                      s   d    S Nz>Non-empty 4D data tensor expected but got a tensor with sizes r   r7   r  r7   r8   rW     rF   z$upsample_nearest2d.<locals>.<lambda>r   r  r   r,  r   )rL   rY   r{   r  r   r  r|   rG   r!   r}   rk   rc   r   
contiguous)	r   r  scales_hscales_wr  r   r   rJ   Z
n_channelsr7   r  r8   upsample_nearest2d  s    



r  r$  r  r  r  r  c                    st   t ||dd tjdkfdd tdD ]*t  k fdd q0|jt	dS )Nr   r  r,  c                      s   d j  S NzFExpected grad_output to be a tensor of dimension 4 but got: dimension r  r7   r  r7   r8   rW     rF   z-upsample_nearest2d_backward.<locals>.<lambda>c                
      s&   d d   d d  S )NzCExpected grad_output to have the same shape as output; output.size() = z but got grad_output.size(r   r7   r  r$  r   r7   r8   rW   #  s    r   )
r  rL   rY   r   r   r   r|   r  rG   r!   r  r7   r  r8   upsample_nearest2d_backward  s    
	
r  c                    sZ   t   dkp"t  dd   fdd t  |dd} |jt	 dS )Nr   r   c                      s   d    S )Nz>Non-empty 5D data tensor expected but got a tensor with sizes r   r7   r  r7   r8   rW   5  rF   z$upsample_nearest3d.<locals>.<lambda>r0   r  r   r  )r   r  Zscales_dr  r  r  r7   r  r8   upsample_nearest3d/  s    


r  c           
      C   s   t | t j| t jd }}|d ur|d urt|ts:J t|tsHJ |j}| }	t||}t||}|||	 |||	 t	||d t	||d ||fS ||fS )Nrv   )r  r  )
rL   r   r   r\   r"   r}   r   r$   r   r&   )
r~   stablern   Z
descendingr   r   r  r   r   
out_strider7   r7   r8   	meta_sort?  s    	

r  c                    s  t jdkfdd t jjkfdd dd urt jdkfdd t  kfdd t jjkfdd t jdkfd	d d
   t   k fdd t tfddfD dd  d S )Nr   c                      s    j  dS Nz != 2r  r7   input_gatesr7   r8   rW   b  rF   z%rnn_cell_checkSizes.<locals>.<lambda>c                      s   j  d j  S N != r   r7   )hidden_gatesr  r7   r8   rW   e  rF   r   c                      s    j  dS )Nz != 1r  r7   )
input_biasr7   r8   rW   i  rF   c                      s      d  S r  r  r7   )
gates_sizer  r7   r8   rW   l  rF   c                      s   j  d j  S r  r   r7   )hidden_biasr  r7   r8   rW   p  rF   c                      s    j  dS r  r  r7   )prev_hiddenr7   r8   rW   r  rF   r   c                
      s,      dd d d d  d
S )Nr  r   z * z // z (aka rb   )r{   r   r7   )expected_prev_hidden_numelfactorr  r  r  r7   r8   rW   v  rF   c                 3   s   | ]}|j  j kV  qd S r2   ra  r@   r  r7   r8   r_   y  s   z&rnn_cell_checkSizes.<locals>.<genexpr>c                   S   s   dS )Nz%expected all inputs to be same devicer7   r7   r7   r7   r8   rW   }  rF   )rL   rY   r   r}   r   r{   r  )r  r  r  r  r  r  r7   )r  r  r  r  r  r  r  r  r8   rnn_cell_checkSizesZ  s8    





r  c                 C   sL   t | |||d| tj| tjd}tj|tjd}tj|tjd}|||fS )Nr,  r   )r  rL   r   r   )r  r  cxr  r  	workspacehycyr7   r7   r8   _thnn_fused_lstm_cell_meta  s
    r  c                 C   s*  t |dk}|r,t |}|d }| jd }n4|
r:| jd n| jd }|
rR| jd n| jd }d}|rhdnd}|dkrx|n|}|r||| g}n |
r|||| gn|||| g}| |}|	| ||g}|d u rtjd| jd}n
||}||	| ||g}|rdnd}| j|tjd}|||||fS )Nr   r   r   r   ra  rv   )r   r}   r|   rL   rs   rk   r  )r   r  Zweight_stride0Z
weight_bufhxr  r  hidden_sizeZ	proj_size
num_layersbatch_firstZdropouttrainbidirectionalbatch_sizesZdropout_stateZis_input_packed
seq_length
mini_batchZbatch_sizes_sumZnum_directionsout_sizer   r   Z
cell_shaper  r  Zreserve_shapeZreserver7   r7   r8   
_cudnn_rnn  s2    

r  c                 C   s   |r| j d n| j d }|r&| j d n| j d }|
}|rB|||gn|||g}| |}|d u rptjd| jd}n||j }|d u rtjd| jd}n||j }tjd| jtjd}||||fS )Nr   r   ra  r   )r}   r|   rL   rs   rk   r  )r   Zw0Zw1Zw2Zw3hx_Zcx_r   r  r  r  r  
has_biasesr  r  r  r  r  Zoutput_chanelsr   r   r  r  r  r7   r7   r8   mkldnn_rnn_layer  s     
r  c                    sR   | j dkr.t dkp dk fdd n t|  dk fdd d S )Nr   r   c                      s    d  S )Nz4: Expected reduction dim -1 or 0 for scalar but got r7   r7   rn   r  r7   r8   rW     rF   z'zero_numel_check_dims.<locals>.<lambda>c                      s    d  dS )Nz: Expected reduction dim z to have non-zero size.r7   r7   r  r7   r8   rW     rF   )r   rL   rz   r   )r~   rn   r  r7   r  r8   zero_numel_check_dims  s    
r  c                    sD   |d ur$t || }t||  nt| dk fdd d S )Nr   c                      s
     dS )Nz@: Expected reduction dim to be specified for input.numel() == 0.r7   r7   r  r7   r8   rW     rF   z%check_argmax_argmin.<locals>.<lambda>)r   rn   r  rL   rY   r{   )rw  r~   rn   r7   r  r8   check_argmax_argmin  s    

r  c                 C   sD   t d| | t| j|d ur"|fnd }t| ||}| j|tjdS )Nargmaxrv   )r  rG   rM  r}   rN  r|   rL   r   )r~   rn   rP  rn  r}   r7   r7   r8   argmax_argmin_meta	  s    r   c                 C   s$   |t jkrt j}t jd||||dS )Nr7   r   )rL   Zjaggedr  rs   )r   rS   rj   rk   rl   r7   r7   r8   scalar_tensor  s
    

r  c                 C   s   t ||  dd}|  dkr"dn| |}t| t||kdd  t| j}t|dkrh|||< | 	|| j	|tj
dfS )NTr3  r   r   c                   S   s   dS )Nzk not in range for dimensionr7   r7   r7   r7   r8   rW   #  rF   ztopk_meta.<locals>.<lambda>rv   )r   rn   r   rL   rl  rY   r   r}   r   r|   r   )r~   r/  rn   Zlargestrq  Z	sliceSizeZtopKSizer7   r7   r8   	topk_meta  s    

r  c           
      C   s@   |d us|d usJ d|  }|   }	tj||	j|	j|	jdS )Nz;segment_reduce(): Either lengths or offsets must be defined)rS   rk   rj   )r  rL   r   rS   rk   rj   )
rJ  r   rA  r   r<  r=  r>  r@  Zdata_contigZgrad_contigr7   r7   r8   meta__segment_reduce_backward+  s    r  c                    s   ddl m} t |  dd |  dkr4|  nd}t||dk||k fdd t| jd   | j d d   }|r|  dkr|	 d | 
|| j
|tjdfS )	Nr   )sym_andTr  r   c                      s
   d  S )Nz9kthvalue(): selected number k out of range for dimension r7   r7   r   r7   r8   rW   F  rF   zkthvalue_meta.<locals>.<lambda>rv   )r   r  r   rn   r   rL   rY   r   r}   r  r|   r   )r~   r/  rn   rP  r  ZdimSizer}   r7   r   r8   kthvalue_meta=  s    
$r  c                 C   s   | d ur| n|}t | dkdd  | }| d urPt |  |kdd  |d urpt | |kdd  t | |kdd  t | |kdd  t | dkdd  t | |d	 |d
  d kdd  d S )Nr   c                   S   s   dS N r7   r7   r7   r7   r8   rW   U  rF   z(checkLSTMBackwardSizes.<locals>.<lambda>c                   S   s   dS r  r7   r7   r7   r7   r8   rW   X  rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW   Z  rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW   [  rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW   \  rF   c                   S   s   dS r  r7   r7   r7   r7   r8   rW   ]  rF   r   r   r,  c                   S   s   dS r  r7   r7   r7   r7   r8   rW   ^  rF   )rL   rY   rn   r   r{   )grad_hygrad_cyr  r  r  Zdefined_gradZexp_sizer7   r7   r8   checkLSTMBackwardSizesS  s    r  c           	      C   s`   | d u r|d u rdS t | |||| tj|td}tj|td}|rR|jdddnd }|||fS )N)NNNr   r   F)rP  )r  rL   r   legacy_contiguous_memory_formatr  )	r	  r
  r  r  r  Zhas_biasZ
grad_gatesZgrad_cxr  r7   r7   r8   #_thnn_fused_lstm_cell_backward_implb  s    r  c                 C   sf   d }d }d }|d r"| |  }|d s2|d r\| |d| df}| |d}|||fS )Nr   r   r   r   r0  )r2  r1  r3  r4  r'  grad_weightr  r7   r7   r8   linear_backwardp  s    r  c                    s   t jdkr$jd ||  dks:J dj d| dd   fdd	}jd ||  }jd
 | }jd | }g jd d |||R }|}|j| d}|S )Nr   r  r   z'Invalid input shape for pixel_shuffle: z with upscale_factor = c                 S   s   t j| t jkS r2   r~  r  r7   r7   r8   r    s    z,meta_pixel_shuffle.<locals>.is_channels_lastc                      sN    r"t dkrtjS tjS n(jtjdr6tjS jtjdrJtjS d S r  )r   rL   r   r  r   r  r7   r  r~   r7   r8   r    s    z.meta_pixel_shuffle.<locals>.pick_memory_formatrr  r   r   )r   r}   r|   r  )r~   Zupscale_factorr  r   ZHrZWrr   r   r7   r  r8   meta_pixel_shuffle}  s     
r  c                 C   sZ   |  | j}| |j}| |j}| |j}| |j}| |j}|||||||fS r2   r  )r   Zweight0Zweight1Zweight2Zweight3r  Zcx_tmpr   Zhy_Zcy_Zgrad_output_r_optZgrad_hy_r_optZgrad_cy_r_optr   r  r  r  r  r  r  r  r  r  Zdiff_xZdiff_hxZdiff_cxZdiff_w1Zdiff_w2Zdiff_br7   r7   r8   mkldnn_rnn_layer_backward  s    r  )	out_int32r   c                C   s   t j| |rt jnt jt jdS )NrS   r   )rL   r   r  r   r   )r~   Z
boundariesr  r   r7   r7   r8   meta_bucketize  s
    r  d   c                    s   dt dkr(t fdd t dkrF rFtd tt t fdd t dk fd	d tttfd
d tttfdd tkdd  tj	 j
jdS )Nzhistc()r  c                      s   d j  dS )Nz%"histogram_cpu" not implemented for 'r  rv   r7   r  r7   r8   rW     rF   zmeta_histc.<locals>.<lambda>r   z%_histc_cuda with floating point inputc                      s    dt   S )Nz#: argument 'bins' must be int, not r=  r7   binsr  r7   r8   rW     rF   r   c                      s    d  S )Nz: bins must be > 0, but got r7   r7   r  r7   r8   rW     rF   c                      s     dt  S )Nz%: argument 'min' must be Number, not r=  r7   )r  r   r7   r8   rW     rF   c                      s     dt  S )Nz%: argument 'max' must be Number, not r=  r7   )r  r   r7   r8   rW     rF   c                   S   s   dS )Nz&{fn_name}: max must be larger than minr7   r7   r7   r7   r8   rW     rF   r   )r   rL   rY   r   rG   r  r\   r   r    rs   rk   rS   )r   r  r   r   r7   )r  r  r   r   r   r8   
meta_histc  s.    

r  c                    sd   t   |dd}t  dkp>tdd   dd  D  fdd  |jt	 d	S )
Nr   r  r   c                 s   s   | ]}|d kV  qdS rz  r7   )rA   r   r7   r7   r8   r_     rF   z,meta_upsample_bimode2d_aa.<locals>.<genexpr>r   c                      s   d    S r  r   r7   r  r7   r8   rW     rF   z+meta_upsample_bimode2d_aa.<locals>.<lambda>r   )
r  r   rL   rY   r{   r  r|   r  rG   r!   )r   r  r  r  r  r  r7   r  r8   meta_upsample_bimode2d_aa  s    

(

r  c                    st   t ||dd tjdkfdd tdD ]*tj   k fdd q0|jt	dS )Nr   r  r,  c                      s   d j  S r  r  r7   r  r7   r8   rW   
  rF   z4meta_upsample_bimode2d_aa_backward.<locals>.<lambda>c                
      s&   d d   d d  S )NzD
Expected grad_output to have the same shape as output; output.size(r  z
but got grad_output_size(r   r7   r  r7   r8   rW     s   r   )
r  rL   rY   r   r   r}   r|   r  rG   r!   )r$  r  r  r  r  r  r7   r  r8   "meta_upsample_bimode2d_aa_backward  s    	

r  c                 C   s\   t | dkdd  t | dkdd  t |jjdd  t |jjdd  d S )Nr   c                   S   s   dS )Nz%found_inf must be a 1-element tensor.r7   r7   r7   r7   r8   rW     rF   z<_amp_foreach_non_finite_check_and_unscale_.<locals>.<lambda>c                   S   s   dS )Nz%inv_scale must be a 1-element tensor.r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )Nz!found_inf must be a float tensor.r7   r7   r7   r7   r8   rW   #  rF   c                   S   s   dS )Nz!inv_scale must be a float tensor.r7   r7   r7   r7   r8   rW   '  rF   )rL   rY   r{   rS   r   )r~   r;  Z	inv_scaler7   r7   r8   *_amp_foreach_non_finite_check_and_unscale_  s    r  c                 C   s   t |  }| |S r2   )r   r   r|   )r~   nanZposinfZneginfr:  r7   r7   r8   
nan_to_num,  s    r  c                 C   s   | j tjtjtjtjhvs,J d| j  d| j}t||}t||}||krR| S t| 	 }t| 
 }|| ||  ||< ||< || ||  ||< ||< | || | S )Nz>torch.transpose_: in-place transposition is not supported for z layout)rj   rL   r  Z
sparse_cscr  Z
sparse_bscr   r   r   r   r   r   )r~   Zdim0rX  ndimsr   r   r7   r7   r8   r  3  s&    	

r  c                 C   sx   | j }| jrD|  }|  }|dkr,|dks`J d| d| dn|  dks`J d| dt| d|dk rrdndS )	Nr   r   zEt_ expects a tensor with <= 2 sparse and 0 dense dimensions, but got z sparse and z dense dimensionsz6t_ expects a tensor with <= 2 dimensions, but self is r  r   )r   r$  r%  r&  rn   r  )r~   r  r%  r&  r7   r7   r8   t_P  s    
r   )r  r   sidesorterc                   s   t tjdkp,jd d  jd d k fdd t d u pRjjkfdd t |dkpr| d |rt jnt j}t t jrt j |t j	dS t j
d	|jd
S d S )Nr   r   c                      s   dt j dt  j S )Nztorch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor and input value tensor must match, but we got boundaries tensor z and input value tensor r   r}   r7   )r~   sorted_sequencer7   r8   rW   s  s
    z#meta_searchsorted.<locals>.<lambda>c                      s(   dt  j dd ur t jng  S )Nz[torch.searchsorted(): boundary and sorter must have the same size, but got boundary tensor z and got sorter tensor r#  r7   )r$  r"  r7   r8   rW   ~  s
    r   zetorch.searchsorted(): side and right can't be set to opposites, got side of left while right was Truer  r7   r  )rL   rY   r   r}   r  r   r\   r   r   r   rs   rk   )r$  r~   r  r   r!  r"  rS   r7   )r~   r$  r"  r8   meta_searchsortedc  s&    
r%  c                    s(   t  t jt jt jfv fdd d S )Nc                      s
   d  S )Nz/Unsupported input type encountered for isin(): r7   r7   rv   r7   r8   rW     rF   z3_check_for_unsupported_isin_dtype.<locals>.<lambda>)rL   rY   r  Z
complex128Z	complex64rv   r7   rv   r8   !_check_for_unsupported_isin_dtype  s    
r&  c                 C   s   |  || df}|S )Nr   r0  )r$  r   num_weightsr  r  r  r7   r7   r8   meta_embedding_dense_backward  s    r(  c                 C   s>   |	r t | ||||||||
|
S t| ||||||||
|
S d S r2   )r.   Z_embedding_bag_sparse_backward!meta_embedding_bag_dense_backward)rJ  r   r=  r  r  maximum_indicesr'  r  r  r  r  r  r7   r7   r8   meta_embedding_bag_backward  s2    r+  c
                    sX   t  jt jt jt jt jfv  fdd |tkr@t |d u  | 	df}
|
S )Nc                      s   d j  S )Nz$Unsupported input type encountered: rv   r7   rJ  r7   r8   rW     rF   z3meta_embedding_bag_dense_backward.<locals>.<lambda>r   )
rL   rY   rS   r,  r-  r+  float64r  r|   r   )rJ  r   r  r  r*  r'  r  r  r  r  Zindex_grad_weightr7   r,  r8   r)    s    
r)  c           
      C   s~   |  d}t|tkd t|  dk t| dk | d}t| dk t| d|k | |f}	|	S )Nr   zHembedding_bag_backward: per_sample_weights only supported for mode='sum'r   r   )r   rL   rY   r  rn   r|   )
rJ  r  r   r=  r  r  r  Zembedding_featuresr  r   r7   r7   r8   .meta_embedding_bag_per_sample_weights_backward  s    


r.  )assume_uniqueinvertc                C   sx   t t| tpt|tdd  t| ts:t j| |jd} t|tsTt j|| jd}t| j t|j t j| t j	dS )Nc                   S   s   dS )Nz<At least one of elements and test_elements must be a Tensor.r7   r7   r7   r7   r8   rW     rF   zmeta_isin.<locals>.<lambda>ra  rv   )
rL   rY   r\   r   r  rk   r&  rS   r   r  )elementsZtest_elementsr/  r0  r7   r7   r8   	meta_isin  s    



r2  )r   r~   r1   c                 C   s4   t | dkdd  t|tjd\}}t j||dS )Nr   c                   S   s   dS )Nz,polygamma(n, x) does not support negative n.r7   r7   r7   r7   r8   rW     rF   z meta_polygamma.<locals>.<lambda>rU  rv   )rL   rY   r   r   rV  r   )r   r~   rJ   rD   r7   r7   r8   meta_polygamma  s    
r3  r   c                 C   s   t dd S )Nz.Tensor.item() cannot be called on meta tensors)r  r   r7   r7   r8   meta_local_scalar_dense  s    r4  )r~   r1   c                 C   s
   t | S r2   rb  r   r7   r7   r8   silu$  s    r5  c                 C   s    t | tjd\}}tj| |dS rT  )r   r   rV  rL   r   )r~   rJ   rD   r7   r7   r8   sigmoid*  s
    
r6  c                 C   sH  |   dk}|  dk}|rp|r>|d| d|dg}qt|d|dkd | d|dg}nn|rt|d| dkd | d|dg}n8t| d|dkd | d| d|dg}|p| j}d|j }|d | d | | }||kr(|d | |dg}	n|dg}	tj||	|| jd}
|
S )	Nr   r   r   z matrix batch sizes have to matchr   zbatched dimension has to matchr*  r  )rn   r   rL   rY   rS   itemsizer  rk   )r  r  offsr  Z
mat1_is_2dZ
mat2_is_2dr  	alignmentZsize_paddedr  r   r7   r7   r8    _create_grouped_mm_output_tensor4  s0    


r:  	mat_amat_br  r  r8  r  r  r  r  c	                    s  t |d u |d u kdd  |d uo*|d u}	|	r\t  jt jkoJjt jk fdd n*t  jt jkovjt jk fdd t   dv o dv  fdd   dk}
 dk}|	rdd	 }d
d }t |  fdd t |fdd dd }|d  |d |d ur|d urt |jt jkoZ|jt jkdd  d dd}d ur|
r|rjd nd}|d| d| |d|d| t |d u dd  |
s|r0t d u fdd d urDt  dkfdd t jt jkfdd nt d u dd  t |d u dd  t |d u pn|t jkdd  t	 |S )!Nc                   S   s   dS )Nz,Either both scale factors are given, or noner7   r7   r7   r7   r8   rW   d  rF   z)_meta_grouped_mm_common.<locals>.<lambda>c                      s   d j  dj  dS )Nz5Expected inputs of E4M3 FP8 type but got mat_a.dtype= and mat_b.dtype=r  rv   r7   r<  r=  r7   r8   rW   o  rF   c                      s   d j  dj  dS )Nz1Expected inputs of BF16 type but got mat_a.dtype=r>  r  rv   r7   r?  r7   r8   rW   t  rF   )r   r0   c                      s   d    d   S )Nz3Multiplicands must be 2D or 3D but got mat_a.dim()=z and mat_b.dim()=r   r7   r?  r7   r8   rW   y  rF   r   c                 S   s    |   }|d dko|d dkS Nrr  r   r   r  mat
mat_strider7   r7   r8   r    s    z-_meta_grouped_mm_common.<locals>.is_row_majorc                 S   s    |   }|d dko|d dkS r@  r  rA  r7   r7   r8   r    s    z-_meta_grouped_mm_common.<locals>.is_col_majorc                      s   d   dd   S )NzNExpected mat_a tensor to be row major in the last two dimensions, got strides rr  r  r7   )r<  r7   r8   rW     rF   c                      s   d   dd   S )NzQExpected mat_b tensor to be column major in the last two dimensions, got strides rr  r  r7   )r=  r7   r8   rW     rF   c                    s     d  d  }  d  dkrr  tdj d  krrt  | dk fdd nh  dkrĈ d  tdj  krt d  | dk fdd ntdfdd d S )	Nr   r*  r   c                      s   d d  d   dS )Nr   stride along % dim to be multiple of 16 bytes, got r  r7   r7   end_dimmat_namerC  r7   r8   rW     rF   zF_meta_grouped_mm_common.<locals>.check_valid_strides.<locals>.<lambda>c                      s$   d d d  d d   dS )Nr  rD  r   rE  r  r7   r7   rF  r7   r8   rW     rF   Fc                      s   d d j  dS )NzInvalid strides/sizes, got z for strides and z for sizes.r   r7   rA  r7   r8   rW     rF   )rn   Zelement_sizer   r   r}   rL   rY   )rH  rB  r9  r7   )rG  rB  rH  rC  r8   check_valid_strides  s*    
z4_meta_grouped_mm_common.<locals>.check_valid_stridesr<  r=  c                   S   s   dS )NzBoth scale_a and scale_b must be float (fp32) tensors, but got scale_a.dtype={scale_a.dtype} and scale_b.dtype={scale_b.dtype}.r7   r7   r7   r7   r8   rW     rF   r   c                    s
     dkrtt  dkfdd t fdd tjd  j  k fdd nt  dkfdd tddkfd	d tjd  jd k fd
d tjd  jd  k fdd d S )Nr   r   c                      s   d d    dS )Nr  z to be 1D tensor, but got 	D tensor.r   r7   rd  
scale_namer7   r8   rW     rF   z>_meta_grouped_mm_common.<locals>.check_scale.<locals>.<lambda>c                      s   d  dS )Nr  z to be contiguous.r7   r7   rL  r7   r8   rW     rF   r   c                      s(   d d j    dj d  dS )Nr  z	 to have r  r   z
 elements.r   r7   rB  rd  scale_multiplierrL  
scaled_dimr7   r8   rW     rF   c                      s   d d    dS )Nr  z to be 2D tensor, but got rJ  r   r7   rK  r7   r8   rW     rF   c                      s   d  dS )Nr  z( to be contiguous in the last dimension.r7   r7   rM  r7   r8   rW     rF   c                      s$   d d j d  dj d  dS )Nr  z batch dimension to be r   , got r  r   r7   )rB  rd  rL  r7   r8   rW     rF   c                      s(   d d j d   dj d  dS )Nr  z non-batch dimension to be r   rQ  r  r   r7   )rB  rd  rL  rP  r7   r8   rW     rF   )rn   rL   rY   r   r}   r   )rL  rd  rB  rP  rO  r7   rN  r8   check_scale  s:    



z,_meta_grouped_mm_common.<locals>.check_scaler   r  r  c                   S   s   dS )Nz:Scale result tensor provided, but it is not supported yet.r7   r7   r7   r7   r8   rW     rF   c                      s   d    d   dS )Nz/Offsets tensor not provided, but is needed for zD/zD multiplicand layouts.r   r7   r?  r7   r8   rW     rF   c                      s   d    dS )Nz.Offsets tensor must be 1D, but got offs.dim()=r  r   r7   r8  r7   r8   rW     rF   c                      s   d j  dS )Nz7Offsets tensor must be integer (int32) tensor, but got r  rv   r7   rS  r7   r8   rW     rF   c                   S   s   dS )NzJOffsets tensor provided, but is not needed for 3D/3D multiplicand layouts.r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )Nz2Bias tensor provided, but it is not supported yet.r7   r7   r7   r7   r8   rW     rF   c                   S   s   dS )Nz4If output dtype provided, it must be torch.bfloat16.r7   r7   r7   r7   r8   rW     rF   )r   )
rL   rY   rS   r.  r-  rn   r+  r}   r  r:  )r<  r=  r  r  r8  r  r  r  r  ZscaledZmat_a_is_2dZmat_b_is_2dr  r  rI  rR  rO  r7   )r<  r=  r8  r8   _meta_grouped_mm_commonW  s    




!"




rT  )r<  r=  r8  r  r  r1   c              
   C   s   t | |d d ||d |dS )N)r  r  r8  r  r  r  rT  )r<  r=  r8  r  r  r7   r7   r8   
grouped_mm  s    	rV  c	           	      C   s   t | ||||||||d	S )N)r  r  r8  r  r  r  r  rU  r;  r7   r7   r8   meta_scaled_grouped_mm  s    rW  )rB   rn   half_to_floatr1   c                 C   sL   |r| j tjksJ tj| tjjd\}}|s2|n|}tj| |tjd}|S )NrU  r  )	rS   rL   rM   rG   r   r   rH   r   r   )rB   rn   rX  Zcomputation_dtyperD   r  r7   r7   r8   softmax-  s    
rY  c              	      s   t td dkfdd | jttd }| t |kfdd td  }t|D ]b t d d       d   }t |dk fdd || qtt j|| j| j	| j
t| dS )	Nr   r   c                      s   dt   S )Nz1Length of pad must be even but instead it equals r  r7   r  r7   r8   rW   A  rF   z'_constant_pad_nd_meta.<locals>.<lambda>c                      s   dt  d  dS )Nz`Length of pad should be no more than twice the number of dimensions of the input. Pad length is z while the input has z dimensions.r  r7   )l_inpr  r7   r8   rW   K  s   r   c                	      s6   d    d  dd   d   d	S )NzThe input size z, plus negative padding r   r   zG resulted in a negative output size, which is invalid. Check dimension z of your input.r7   r7   )r   r   l_diffr  pad_idxr7   r8   rW   V  s   
)rS   rk   rm   r   )rL   rY   r   r}   r   r   r   rs   rS   rk   rm   r!   )r   r  r  Zl_padr   Znew_dimr7   )r   r   r[  rZ  r  r\  r8   _constant_pad_nd_meta;  s8    
 r]  )r  r   r  r  r  r1   c           	      C   sx   |   dksJ d| j}|j}|jdkr6|d f}n.|jdkrR|d |d f}ng ||d R }| j}| j||dS )Nr   z'weight' must be 2-Dr   r   rv   )rn   r}   r   rS   r|   )	r  r   r  r  r  Zweight_shapeZindices_shaper   r  r7   r7   r8   	embeddinge  s    	

r^  )r   r=  max_lengthspadding_valuec                 C   s\   t |dksJ t |dks J |d jd d }|d }||g| jdd  R }| |S r  )r   r}   r|   )r   r=  r_  r`  r  r  r1  r7   r7   r8   $meta__jagged_to_padded_dense_forward}  s    ra  c                 C   s   t | t dd }|S )Nc                 S   s   t | tjdS r  rK   r   rV  r   r7   r7   r8   _f  s    z)_create_unary_float_meta_func.<locals>._fr<   r'   funcrc  r7   r7   r8   _create_unary_float_meta_func  s    rg  c                 C   s   t | t dd }|S )Nc                 S   s   t | |tjdS r  rb  )rB   r  r7   r7   r8   rc    s    z*_create_binary_float_meta_func.<locals>._frd  re  r7   r7   r8   _create_binary_float_meta_func  s    rh  c                    s<   t   fdd} j d}||_ttt||}|S )Nc                    s(    | g|R i |}t | j|j | S r2   r  )r~   rI   r8  r   r5   r7   r8   _fn  s    z#_register_inplace_meta.<locals>._fnrJ   )r   rd   r<   getattrr.   )r6   ri  Zinplace_namer7   r5   r8   _register_inplace_meta  s    rk  c                    sr   t j jk fdd  g}ttrbjdkrXt jjkfdd | t|dtj	iS )Nc                      s   dj  d j  S )Nr  z for `end`, but got dtype rv   r7   )re   rf   r7   r8   rW     rF   zlerp.<locals>.<lambda>r   c                      s   d j  dj  S )Nr  z for `weight`, but got dtype rv   r7   )rf   r  r7   r8   rW     rF   r>   )
rL   rY   rS   r\   r"   r   r   rK   r   rH   )rf   re   r  rI   r7   )re   rf   r  r8   lerp  s"    




rl  )r  c                C   s   t | ||tjdS r  r  r   Ztensor1Ztensor2r  r7   r7   r8   addcmul  s    
rn  c                C   s8   t t|jot|j dd  t| ||tjdS )Nc                   S   s   dS )N)zFInteger division with addcdiv is no longer supported, and in a future zErelease addcdiv will perform a true division of tensor1 and tensor2. z4The historic addcdiv behavior can be implemented as zA(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) zfor integer inputs and as z6(input + value * tensor1 / tensor2) for float inputs. z?The future addcdiv behavior is just the latter implementation: z4(input + value * tensor1 / tensor2), for all dtypes.r7   r7   r7   r7   r8   rW     rF   zaddcdiv.<locals>.<lambda>r=   )rL   rY   rG   r  rS   rK   r   rH   rm  r7   r7   r8   addcdiv  s    

ro  c                  C   s8  i } dD ]*}t | }|D ]}|| vr|| | |< qq|  D ]\}}t|tjjrTq<t|tsbJ |tjj	j
| tj| dr|t d v rt| dq<|jrq<| dv rq<d| v rt|| q<d| v rt|| q<d| v r
t|| q<d	| v r&t|| q<t|| q<d S )
N)rh   Zpost_autogradZpre_autogradZCompositeImplicitAutogradrh   z is a CompositeImplicitAutograd op, we shouldn't register meta function for it. Instead, we should let the decomposition run and write meta kernels for the base operators.>   zaten::constant_pad_ndzaten::clonezaten::_to_copyzaten::copy_zaten::rot90zaten::empty_stridedzaten::as_strided_scatterzmkldnn::zmkl::zonednn::zquantized::)r   itemsr\   rL   Z_opsZHigherOrderOperatorr   Zpy_impl_CZDispatchKeyr/   Z%_dispatch_has_kernel_for_dispatch_keyrw  r  Zis_view2_meta_lib_dont_use_me_use_register_meta_for_mkldnnimpl/_meta_lib_dont_use_me_use_register_meta_for_mkl2_meta_lib_dont_use_me_use_register_meta_for_onednn5_meta_lib_dont_use_me_use_register_meta_for_quantized'_meta_lib_dont_use_me_use_register_meta)Zactivate_meta_tablerc   registryZopoZop_overloadr6   r7   r7   r8   activate_meta  sH    ry  )F)T)F)F)NNN)N)NNNFr   r   r   )F)F)NN)NNN)NN)T)rv  )r  )r  T)F)F)F)FF)F)TT)r  )FTN)TFF)TF)r   )r^  N)r^  N)N)FF)N)r7   r   r  F)r7   r   FTN)r7   r   FTN)N)Fr   FNFr   )T)NF)r   F)r   )r   )r  r  FN)r  r  FN)r  r  FN)F)F)NN)F)r7   r   r  F)r7   r   r  F)NNNNN)r   NNr   )T)F)F)N)N)NNF)N)N)rV  FFN)rV  FFN)NrV  FFN)N)rV  FNN)NN)rV  FN)FN)N)NNNNN)NNN)FNNNN)NNF)NNNF)T)T)F)N)NN)NN)NNN)Nr   FNN)NN)NF)NNNN)r   TT)NNr   N)r   F)r  r   r   )NN)NN)NNN)r   )r   )r   )NNNNF)NNN)NNNNF)r   )r   FF)rV  (  r  r   collections.abcr   enumr   	functoolsr   r   typingr   r   r   r	   Ztyping_extensionsr
   rL   Ztorch._prims_commonr  rG   r   r   r   Ztorch._decompr   r   r   r   Z
torch._opsr   Ztorch._primsr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   Ztorch._prims_common.wrappersr#   r$   r%   r&   r'   r+  r(   r)   Ztorch.fx.experimentalr*   rW  Ztorch.utilsr+   r:   r,   r-   opsr.   libraryLibraryrw  r   r  r  r  r<   rK   rT   rZ   ZlinspaceZlogspacer  ru   Ztaker  r   r   r   r   r   r   r  r   r   ZcummaxZcumminr   r   r   r   r   r  r   Z_fft_c2cr   r   r   Z_fft_r2cr   ZrandpermZgenerator_outr   ry   r   randintr   r   Zlow_outr   Zrandr  Z_fft_c2rr  rX  r
  r  Z
unsqueeze_r  Z_sparse_semi_structured_linearrp  rS   r  Z_sparse_semi_structured_mmr  Z_sparse_semi_structured_addmmr!  Z_cslt_sparse_mmr2  Zindex_reducer7  Zindex_reduce_r8  Zindex_selectr;  Zsegment_reducerG  r   Z	unary_outrJ  rn   rQ  r   rR  rS  rY  rW  rZ  Z_assert_asyncr\  r   r^  Z_printr_  Z_make_dep_tokenrb  rf  Z_functional_sym_constrain_rangerj  rm  Z(_functional_sym_constrain_range_for_sizern  Z_functional_assert_asyncro  r   r|  r   r  r  r  r  Z_linalg_eighr  r  Z_linalg_eigvalsZlinalg_eigvalsr  Z
linalg_eigr  r  r  r  r  r  r  r  Zlinalg_inv_exr  Zlinalg_ldl_factor_exrX   r  Zlinalg_ldl_solver  Z	linalg_lur  Zlinalg_lu_factor_exr  Zlinalg_lu_solver  Z	lu_unpackr  r  Z	linalg_qrr  r  r  Z_linalg_svdr  r  r  r  r  Zlinalg_solve_triangularr  r  r  Z_linalg_detr  r  r	  r  Zreflection_pad1dr  Zreplication_pad1dr   r&  Zreflection_pad1d_backwardr*  Zreplication_pad1d_backwardr+  r6  Zreflection_pad2dr7  Zreplication_pad2dr8  Zreflection_pad2d_backwardr'  Zreplication_pad2d_backwardr;  rC  Zreflection_pad3drD  Zreplication_pad3drE  Zreflection_pad3d_backwardZreplication_pad3d_backwardrG  Z_pdist_forwardrO   rI  Z_pdist_backwardrL  Zbaddbmmr[  Z	bernoullir]  Z
bernoulli_r`  rH  ra  Zpoissonrc  Z_fused_moving_avg_obs_fq_helperrf  mmrk  rN  r   r}  r  Zmiopen_batch_normr  Zconvolutionr  rq  Z_has_mkldnnrr  r  Z_convolution_pointwiser  Z_linear_pointwiser  Zhas_mklrt  r  Z_mkl_linearr  ru  r  Zqconv2d_pointwiseZqconv_pointwiser  binaryr  Zqlinear_pointwiser  r  Zbinary_tensorr  Zlinear_dynamic_fp16Zlinear_relu_dynamic_fp16r  rv  r  Z
max_pool2dr  Zint4mm_packed_weight_cpur  r  Z
avg_pool2dr  r  Zavg_pool2d_backwardr  Z
avg_pool3dr  Zavg_pool3d_backwardr  Z_adaptive_avg_pool2dr  Z_adaptive_avg_pool3dr  Z_adaptive_avg_pool2d_backwardr  Z_adaptive_avg_pool3d_backwardr  r  Zadaptive_max_pool2dr  r  r  Zadaptive_max_pool3dr  r  r  Zrepeat_interleaver  r]   r  r  r  rx   Z_unsafe_indexr/  Zconvolution_backwardr5  Zaddbmmr7  Zrandint_liker9  Z_fused_adam_Z_fused_adamw_rM  Z_fused_adamrO  Z_int_mmrP  Z_convert_weight_to_int4packrW  Z#_convert_weight_to_int4pack_for_cpurY  Z_weight_int4pack_mmr^  Z_weight_int4pack_mm_for_cpur_  r`  ra  rx  Z_dyn_quant_pack_4bit_weightr  Z_dyn_quant_matmul_4bitr  Z_weight_int8pack_mmr  Z_cdist_forwardr  Z_cdist_backwardr  Z_embedding_bagr  Z_embedding_bag_forward_onlyr  r  Znansumr  ZmedianZ	nanmedianr  Z
dim_valuesr  r   r  Zlogical_not_r  repeatr  Zzero_r  Zmul_ZScalarZdiv_Zlogical_and_Zlogical_or_Zlogical_xor_r  Zadd_Zsub_r  roundZdecimalsr  r  
__rshift__r  
__lshift__r  Zzeror  r'  r  fillr  Zrelu_r  Z	_add_relur  Zrrelu_with_noiser  Zrrelu_with_noise_functionalr  Zrrelu_with_noise_r  Z	index_putZ_unsafe_index_putr  Zmasked_fill_r  Z_masked_scaler  Zmasked_scatter_r  Zmasked_scatterr  Zmasked_scatter_backwardr  Z
index_put_r  aliasr  r  Zbmmr  r  r  r  r  r  r  r  r  r  Z max_pool2d_with_indices_backwardr  Zmax_pool2d_with_indicesr  Zfractional_max_pool2dr  Zmax_pool3d_with_indicesr  Z max_pool3d_with_indices_backwardr  r  r  r  Zgrid_sampler_2d_backwardr  r!  r"  r#  r  Zonesr*  zerosr+  r*  r-  Zselect_scatterr/  Zslice_scatterr1  r   r4  r9  Zgatherr;  rA  rE  rF  rJ  rL  Zscatter_addrP  Zscatter_add_rR  rK  r	  r  Zvalue_reducerT  Zscatter_rU  Z#_scaled_dot_product_flash_attentionrm  rr  Z#_scaled_dot_product_cudnn_attentionr{  Z0_scaled_dot_product_fused_attention_overrideabler|  Z,_scaled_dot_product_flash_attention_backwardr  Z+_scaled_dot_product_flash_attention_for_cpur  Z4_scaled_dot_product_flash_attention_for_cpu_backwardr  Z'_scaled_dot_product_efficient_attentionr  Z0_scaled_dot_product_efficient_attention_backwardr  Z,_scaled_dot_product_cudnn_attention_backwardr  Z_flash_attention_forwardr  Z_flash_attention_backwardr  Z_efficient_attention_forwardr  Z_efficient_attention_backwardZSymIntr  Z
_scaled_mmr  Zscatter_reduceZtwoZtwo_outr  Zscatter_reduce_r  Zmultinomialr  r  r  r  Z_upsample_nearest_exact1dr  Z_upsample_nearest_exact2dr  Z"_upsample_nearest_exact2d_backwardr  Z_upsample_nearest_exact3dr   r  Zvalues_stabler  r  Z_thnn_fused_lstm_cellr  r  r  r  r  r  Zargminr   r  Ztopkr  Z_segment_reduce_backwardr  Zkthvaluer  r   r  r  r  r  Zpixel_shuffler  r  Z	bucketizeZ
Tensor_outr  Zhistcr  Z_upsample_bilinear2d_aaZ_upsample_bicubic2d_aar  Z _upsample_bilinear2d_aa_backwardr  r  r  r  r   Zsearchsortedr%  r&  Zembedding_dense_backwardr(  Z_embedding_bag_backwardr+  Z_embedding_bag_dense_backwardr)  Z*_embedding_bag_per_sample_weights_backwardr.  isinr2  Z	polygammar3  Z_local_scalar_denser4  r5  r6  r:  rT  Z_grouped_mmrV  Z_scaled_grouped_mmrW  Z_softmaxrY  Zconstant_pad_ndr]  r^  Z_jagged_to_padded_dense_forwardra  rg  rh  Zspecial_airy_aiZspecial_bessel_y0Zspecial_bessel_y1Zspecial_modified_bessel_i0Zspecial_modified_bessel_i1Zspecial_modified_bessel_k0Zspecial_modified_bessel_k1Z!special_scaled_modified_bessel_k0Z!special_scaled_modified_bessel_k1Zspecial_chebyshev_polynomial_tZspecial_chebyshev_polynomial_uZspecial_chebyshev_polynomial_vZspecial_chebyshev_polynomial_wZ&special_shifted_chebyshev_polynomial_tZ&special_shifted_chebyshev_polynomial_uZ&special_shifted_chebyshev_polynomial_vZ&special_shifted_chebyshev_polynomial_wZspecial_hermite_polynomial_hZspecial_hermite_polynomial_heZspecial_laguerre_polynomial_lZspecial_legendre_polynomial_prk  rl  rn  ro  Zlerp_Zaddcmul_Zaddcdiv_Ztorch._refs.nn.functionalZtorch._refs.specialry  r7   r7   r7   r8   <module>   sX  <(	8I
D

		6
;

'
	

  !"      0



#
	
	











	





'


2

*
&
"7
 (" 
  &
:  
/
 Z&5?'"
"

 

l%,$       Q,H
    X
N


.


*
" 
$m#
     c	







-


)
T\?6L+   &T

   g
e(

	    , 1	






   A   +    *	    *  , 7    K      /  7    #
 


' 7'

"  
0

*



	



(     )
      
 
(
  




E