
    JThP             !          S SK r S SKrS SKrS SKJr  S SKJr  S SKrS SKJ	r	  S SK
Jr  SSKJr  \" \R                  " SS	5      5      rS
 rS rS rS rS rS rS rS rS rS rS rS rSrS jrS rS rS r S r!SS.S jr"      SsS jr#        StS jr$ " S S 5      r%\" \S!9S" 5       r& SuS# jr'SvS$ jr(SSSSSS%SSS&.S'\RR                  S(\RR                  S)\RR                  S*\\RR                     S+\\RR                     S,\\RR                     S-\*S.\\+\\   \\   \\   4      S/\\,   4S0 jjr-SSSSSS%SSS&.S'\RR                  S(\RR                  S)\RR                  S*\\RR                     S+\\RR                     S,\\RR                     S-\*S.\\+\\   \\   \\   4      S/\\,   4S1 jjr.\" 5       (       Ga&  S SK/r/S SK0J1r2  \/Rf                  S2\2Rh                  S3\2Rh                  S4\2Rh                  S5\2Rh                  S6\2Rh                  S7\2Rh                  4S8 j5       r5\/Rf                  S3\2Rh                  S4\2Rh                  S6\2Rh                  S7\2Rh                  S9\2Rh                  4
S: j5       r6S; r7S<S<SS%SS=.S'\RR                  S>\RR                  S?\RR                  S,\\RR                     S-\*S.\\+\\   \\   \\   4      4S@ jjr8SS%SSSA.S(\RR                  S)\RR                  S,\\RR                     S-\*S.\\+\\   \\   \\   4      S/\\,   4SB jjr9\/Rf                  SC\2Rh                  SD\2Rh                  4SE j5       r:SrSF jr;   SwSG\RR                  SH\RR                  SI\RR                  SJ\\RR                     SK\<SL\*SM\\<   4SN jjr=\/Rf                  SO\2Rh                  SP\2Rh                  SQ\2Rh                  SR\2Rh                  SS\2Rh                  ST\2Rh                  S7\2Rh                  4SU j5       r>SV\RR                  SW\RR                  SX\RR                  SY\RR                  SZ\RR                  4
S[ jr?\/Rf                  S\\2Rh                  S]\2Rh                  SR\2Rh                  S^\2Rh                  SS\2Rh                  ST\2Rh                  S_\2Rh                  S7\2Rh                  4S` j5       r@ SxSV\RR                  SW\RR                  Sa\RR                  Sb\RR                  Sc\RR                  Sd\RR                  S/\,SZ\RR                  Se\*4Sf jjrA\/Rf                  Sg\2Rh                  Sh\2Rh                  Si\2Rh                  Sj\2Rh                  Sk\2Rh                  Sl\2Rh                  Sm\2Rh                  Sn\2Rh                  So\2Rh                  S3\2Rh                  S4\2Rh                  Sp\2Rh                  S6\2Rh                  S7\2Rh                  S9\2Rh                  S^\2Rh                  4 Sq j5       rBgSr;Sr9Sr8Sr=Sr?SrASrBg)y    N)	lru_cache)Optional)	warn_once)
has_triton   )get_meta*TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE   c                 (    U (       d  [        U5      eg N)
ValueError)condmsgs     P/var/www/auris/envauris/lib/python3.13/site-packages/torch/sparse/_triton_ops.pycheckr      s    o     c                 X    [        UR                  [        R                  :H  U  S35        g )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layouttorch
sparse_bsr)f_namets     r   check_bsr_layoutr      s'    		E$$$(RSr   c                 |    [        UR                  U:H  =(       a    UR                  R                  S:H  U  S35        g )Ncudaz9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   s      r   check_devicer   !   s3    		F6qxx}}6(KLr   c           	      .   [        UR                  5       S:  =(       a    UR                  5       S:  U  SUR                  5        SUR                  5        S35        UR                  SS  u  p4UR                  SS  u  pV[        XE:H  U  SU SU S35        g )Nr
   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhs_mklkr_ns          r   check_mm_compatible_shapesr*   (   s    		Q)3779>(   #	{*<SWWYKq	J YYrs^FBYYrs^FB	
(   "t#KB4q	Rr   c           	          [        UR                  U:H  =(       aE    UR                  [        R                  [        R                  [        R
                  4[        U6 -   ;   U  SU SUR                   S35        g )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r    )r   dtyper   halfbfloat16floattuple)r   r   r,   additional_dtypess       r   check_dtyper2   9   sp    		5 	SGGZZ5?P8QQS( 33D2E FGG9A	'	r   c           	      |   ^ [        U5      S:X  d   eS mU4S jn[        U" U5      U  SUS    SUS    S35        g )	Nr
   c                     X S-
  -  (       + $ Nr    )vs    r   is_power_of_two(check_blocksize.<locals>.is_power_of_twoG   s    QK  r   c                 X   > SnU  H   nUS:  =(       a    T" U5      =(       a    UnM"     U$ )NT   r6   )bres	blocksizer8   s      r   is_compatible_blocksize0check_blocksize.<locals>.is_compatible_blocksizeJ   s1    I?Ay'AJsC  
r   z(): sparse inputs' blocksize (r   z, r   z;) should be at least 16 and a power of 2 in each dimension.)lenr   )r   r>   r?   r8   s      @r   check_blocksizerB   D   sX    y>Q! 
	*(01b1 OD 	Dr   c                 `    [        U R                  5       5      S:  a  U R                  5       $ U $ )a  Return input as a triton-contiguous tensor.

A triton-contiguous tensor is defined as a tensor that has strides
with minimal value smaller than or equal to 1.

While triton kernels support triton-non-contiguous tensors (all
strides being greater than 1) arguments, a considerable slow-down
occurs because tensor data is copied element-wise rather than
chunk-wise. Zero strides is assumed to not have this defect.
r   )minstride
contiguous)r   s    r   make_triton_contiguousrG   X   s)     188: ||~r   c                 v     [         R                  " S U 5       6 $ ! [         a    [        SU  S35         g f = f)Nc              3   >   #    U  H  oR                   S S v   M     g 7fNr!   r#   .0r   s     r   	<genexpr>'broadcast_batch_dims.<locals>.<genexpr>m   s     'Fgg   Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorss     r   broadcast_batch_dimsrT   k   sB    U%%'Fg'FGG UexRSTUs    88c              '   l   #    U H*  n[        S 5      /UR                  5       -  nXU '   X4   v   M,     g 7fr   )slicer"   )r"   slice_rangerS   r   slicess        r   slicerrY   r   s3     +(!si s   24c              '      #    U HC  n[        S 5      /UR                  5       -  n[        X5       H  u  pVUc  M
  XdU'   M     X4   v   ME     g 7fr   )rV   r"   zip)dimsrX   rS   r   sdd_slices          r   multidim_slicerr`   y   sK     4[MAEEG#d+JA}! , d
 s
   4AAc               '   Z   #    U  H  nUv   UR                  5        S h  vN   M!     g  N	7fr   )rE   )rS   r   s     r   ptr_stride_extractorrb      s&     88: s   +)
+c           
   #     ^ ^^#    S[        T 5      s=::  a  S::  d   e   eS[        T5      s=::  a  S::  d   e   eSS KnU U4S jnU4S jnUR                  " U" 5       6  Hr  n[        T UT5       VVV	s/ s H  u  pxn	[	        Xx-
  U	5      PM     n
nnn	[        Xj5       VVs/ s H  u  p[        XU-   5      PM     nnnU
S S S2   /U" U5      Q7v   Mt     g s  sn	nnf s  snnf 7f)Nr      c               3   T   >#    [        TT5       H  u  p[        SX5      v   M     g 7f)Nr   )r[   range)fgmg	full_gridgrid_blockss     r   generate_grid_points.grid_partitioner.<locals>.generate_grid_points   s&     )[1FB2"" 2s   %(c              3   n   >#    TR                  5        H  u  p[        [        X U5      5      v   M     g 7fr   )itemsnextr`   )rX   r   t_dimstensor_dims_maps      r   generate_sliced_tensors1grid_partitioner.<locals>.generate_sliced_tensors   s-     (..0IAvq9:: 1s   25)rA   	itertoolsproductr[   rD   rV   )ri   rj   rq   ru   rk   rr   
grid_pointrg   gprh   gridgrX   s   ```          r   grid_partitionerr{      s     I#!#####K %A%%%%%#;  '')=)?@
/29j+/V
/VC/V 	 
 25Z1FG1F%F#1FG 4R4j:26::: A
 Hs   A0C%5C
C%"C<)C%c                    ^ SS S S2   nUc  UnO!S m[        U4S j[        X45       5       5      n[        X#U5       H  tpVU " U/UQ76   M     g )N)i  r}   rt   c                 6    U c  U$ [        S[        X5      5      $ r5   )maxrD   )rz   rh   s     r   valid_grid_dim%launch_kernel.<locals>.valid_grid_dim   s    y	 1c!j))r   c              3   8   >#    U  H  u  pT" X5      v   M     g 7fr   r6   )rM   rz   rh   r   s      r   rN    launch_kernel.<locals>.<genexpr>   s      
/NeaN1!!/Ns   )r0   r[   r{   )kernelrq   ri   rj   cuda_max_gridry   sliced_tensorsr   s          @r   launch_kernelr      sg    .tt4M#	*  
/2;/N
 
 "2" 	t%n%"r   c           
      <   U R                  5       R                  S5      nU R                  5       R                  S5      n[        U R	                  5       R                  S5      5      nU Vs/ s H  n[        UR                  S5      5      PM     nn[
        R                  " UR                  S S /S U 5       Q76 nS nU" X'S5      nU" X7S5      nU" XGUR                  SS  5      nU Vs/ s H  nU" XWUR                  SS  5      PM     nnX#U/UQ7$ s  snf s  snf )Nr   c              3   >   #    U  H  oR                   S S v   M     g 7frJ   rK   rL   s     r   rN   !prepare_inputs.<locals>.<genexpr>   s     ;7aWWSb\7rP   c                 `    U R                  X-   5      R                  S[        U5      S-
  5      $ )Nr   r   )broadcast_toflattenrA   )r   
batch_dimsinvariant_dimss      r   batch_broadcast_and_squash2prepare_inputs.<locals>.batch_broadcast_and_squash   s/    ~~j9:BBs:"
 	
r   rt   r!   )crow_indices	unsqueezecol_indicesrG   valuesr   rQ   r#   )	bsrdense_tensorsr   r   r   r   rS   batch_dims_broadcastedr   s	            r   prepare_inputsr      s,   ##%//2L//#--a0K#CJJL$:$:1$=>F?LM}!%akk!n5}GM #33Sb;7;

 .eL -[RWXK'RS(9F
 A 	#1aggbclK  
 f6w667 N,s   +$D+ Dc                 |   [        X/UQ76 nUR                  5       R                  US-   5      nUR                  5       R                  US-   5      nUR	                  5       R                  X1R	                  5       R
                  SS  -   5      nX1R
                  SS  -   n[        R                  " XEXgUR                  S9$ )Nr   r   r!   sizer   )	rT   r   r   r   r   r#   r   sparse_compressed_tensorr   )r   r   rS   batch_shaper   r   r   r   s           r   broadcast_batch_dims_bsrr      s    &v=W=K##%22;3FGL//#00u1DEKZZ\&&{ZZ\5G5G5L'LMF23'D))6SZZ r   c                     U R                   Gt p#nUX1S   -  US   XAS   -  US   /-   nU R                  U5      R                  SS5      $ )Nr   r   r   r!   )r#   view	transpose)r   r>   restmn	new_shapes         r   tile_to_blocksizer      s^    ''KTa	q\!	q\!	 I 66)&&r2..r   c                    U R                   S:  a#  U R                  S5      n U R                   S:  a  M#  U R                   S:  a  U R                  SU R                   S-
  5      n U R                   S:X  d   U R                  5       eU $ )zReturn tensor as 3D tensor by either prepending new dimensions to
the tensor shape (when ``tensor.ndim < 3``), or by collapsing
starting dimensions into the first dimension (when ``tensor.ndim >
3``).
rd   r   )ndimr   r   r#   )tensors    r   	as1Dbatchr      so     ++/!!!$ ++/{{Q6;;?3;;!)V\\)Mr   accumulatorsc                   US   nU R                   S:X  d   eU R                  u  pVnUS:X  Ga$  USS u  pUR                   S:X  d   eUR                  u  pnX{:X  d   eUc?  UR                  S   S-
  n[        R                  " XU4U R                  U R
                  S9nOUR                  u  pnX:X  d   eX:X  d   eUS-  (       d  US-  (       d  US-  (       d  [        cb  [        UR                  S   S-
  5       HA  nUU   nUUS-      n[        UU5       H!  nU	U   u  nnUU==   U U   UU   -  -  ss'   M#     MC     U$ [        XXU5        U$ US:X  Ga  UR                  n[        U5      nUR                  u  nnnUU-  S:X  d   eUSS u  nnnnnUS	   nUc\  UUR                  5       R                  5       S-   U-  -   n [        R                  " / USS
 QU PUP7U R                  U R
                  S9nOUR                  S
S u  n n!U!U:X  d   eUR                  n"[        U5      nUU-  nUS-  (       d  US-  (       d  US-  (       d  [        c  UR                  5         [        U5       H  n#[        UR                  S   5       H  nUU   R                  5       n$UU   R                  5       nUUS-      R                  5       n[        U$U5      u  n%n&UU#U%U%U-   2U&U&U-   24   n'[        UU5       HE  nUU   UU   nn[        UR                  5       U5      u  n(n)U'U U   UU#U(U(U-   2U)U)U-   24   -  -  n'MG     M     M     O[        U UUUUUUU5        UR                  U"5      $ US:X  GaF  UR                  n[        U5      nUR                  u  nnnUU-  S:X  d   eUSS u  nnnnUS	   nUc\  UUR                  5       R                  5       S-   U-  -   n [        R                  " / USS
 QU PUP7U R                  U R
                  S9nOUR                  S
S u  n n!U!U:X  d   eUR                  n"[        U5      nUU-  nUS-  (       d  US-  (       d  US-  (       d  [        c  [        U5       H  n#[        [        U5      5       H  n*[        UU*   R                  5       U5      u  n%n&U%U-  n+U&U-  n,UU+   R                  5       n-UU+S-      R                  5       n.UU#U%U%U-   2U&U&U-   24   n'[!        [        U-U.5      5       HR  u  n/nUU,U.-  UU,-
  U--  -   U/-      R                  5       n[        UU5      u  n(n)U'U U   UU#U(U(U-   2U)U)U-   24   -  -  n'MT     M     M     O<[        R"                  " SUR                  UR
                  S9n[        U UUUUUUU5        UR                  U"5      $ [%        U5      e)aB  Scattered matrix multiplication of tensors.

A scattered matrix multiplication is defined as a series of matrix
multiplications applied to input tensors according to the input
and output mappings specified by indices data.

The following indices data formats are supported for defining a
scattered matrix multiplication operation (:attr:`indices_data[0]`
holds the name of the indices data format as specified below):

- ``"scatter_mm"`` - matrix multiplications scattered in batches
  of tensors.

  If :attr:`blocks` is a :math:`(*  imes M  imes K) tensor,
  :attr:`others` is a :math:`(*     imes K  imes N)` tensor,
  :attr:`accumulators` is a :math:`(*       imes M  imes N)` tensor,
  and :attr:`indices = indices_data['indices']` is a :math:`(*
    imes 3)` tensor, then the operation is equivalent to the
  following code::

    c_offsets, pq = indices_data[1:]
    for r in range(len(c_offsets) - 1):
        for g in range(c_offsets[r], c_offsets[r + 1]):
            p, q = pq[g]
            accumulators[r] += blocks[p] @ others[q]

- ``"bsr_strided_mm"`` - matrix multiplications scattered in
  batches of tensors and a tensor.

  If :attr:`blocks` is a :math:`(Ms         imes Ks) tensor,
  :attr:`others` is a :math:`(*     imes K  imes N)` tensor,
  :attr:`accumulators` is a :math:`(*       imes M  imes N)` tensor, then
  the operation is equivalent to the following code::

    c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:]
    for b in range(nbatches):
        for i, r in enumerate(r_offsets):
            r0, r1 = divmod(r, N)
            acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
            for g in range(c_indices[i], c_indices[i+1]):
                p = p_offsets[g]
                q0, q1 = divmod(q_offsets[g], N)
                acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

  where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
  integer multiples of ``Ms`` and ``Ks``, respectively.

- ``"bsr_strided_mm_compressed"`` - matrix multiplications
  scattered in batches of tensors and a tensor. A memory and
  processor efficient version of ``"bsr_strided_mm"`` format.  If
  :attr:`blocks` is a :math:`(Ms    imes Ks) tensor, :attr:`others`
  is a :math:`(*    imes K  imes N)` tensor, :attr:`accumulators`
  is a :math:`(*    imes M  imes N)` tensor, then the operation is
  equivalent to the following code::

    c_indices, r_offsets, q_offsets, meta = indices_data[1:]
    for b in range(nbatches):
        for r in r_offsets:
            m = (r // N) // Ms
            n = (r % N) // Ns
            r0, r1 = divmod(r, N)
            c0, c1 = c_indices[m], c_indices[m + 1]
            acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
            for i, p in enumerate(range(c0, c1)):
                q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i]
                q0, q1 = divmod(q, N)
                acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

  where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
  integer multiples of ``Ms`` and ``Ks``, respectively.

  Notice that the order of ``r_offsets`` items can be arbitrary;
  this property enables defining swizzle operators via
  rearrangements of ``r_offsets`` items..

Auxilary functions are provided for pre-computing
:attr:`indices_data`. For example,
:func:`bsr_scatter_mm_indices_data` is used to define indices data
for matrix multiplication of BSR and strided tensors.

Parameters
----------
blocks (Tensor): a 3-D tensor of first matrices to be multiplied

others (Tensor): a tensor of second matrices to be multiplied. If
  ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch
  tensor of second input matrices to be multiplied. Otherwise, the
  second input matrices are slices of the :attr:`others` tensor.
indices_data (tuple): a format data that defines the inputs and
  outputs of scattered matrix multiplications.

Keyword arguments
-----------------

accumulators (Tensor, optional): a tensor of matrix product
  accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor
  is a 1-D batch tensor of output matrices. Otherwise, output
  matrices are slices of the :attr:`accumulators` tensor.
r   rd   
scatter_mmr   Nr,   r   r;   bsr_strided_mmSPLIT_Nr!   bsr_strided_mm_compressed)r   )r   r#   r   zerosr,   r   _scatter_mm2rf   r   r   item_scatter_mm6zero_divmodr   rA   	enumerateemptyNotImplementedError)0blocksothersindices_datar   indices_format_PMsKs	c_offsetspq_QKs_NsRMs_Ns_rg0g1rz   pqothers_shapeBKN	c_indices	r_offsets	p_offsets	q_offsetsmetar   MN_accumulators_shaper<   r_r0r1accq0q1jr   r   c0c1is0                                                   r   r   r     sC   H "!_N;;!JBB%$QR(	{{allyy"Q&A ;;6<<L ',,KAC99997b2gbL,@9??1-12q\q1u%r2Aa5DAq Ovay6!9'<<O ' 3  E	+	+||6",,1a2v{{;G;K8	9iDy/immo**,q0Q66A ;;*,s#*Q**&,,v}}L !&&rs+EAr7N7)// .'\7b2gbL,@ 1Xyq12A"1**,B"1**,B"1q5)..0B#B]FB&q"rBw,R"W'DEC"2r](|Yq\1!'!!4Bvay6!R"r'\2R<2O+PPP + 3  	   !344	6	6||6",,1a2v{{0<QR0@-	9iy/immo**,q0Q66A ;;*,s#*Q**&,,v}}L !&&rs+EAr7N7)// .'\7b2gbL,@1Xs9~.A#IaL$5$5$7;FBbAbA"1**,B"1q5)..0B&q"rBw,R"W'DEC )%B- 81%a"f!r/A&AA&EFKKM!'1Bvay6!R"r'\2R<2O+PPP !9 /  IOOI4D4DI 	   !344 ".11r   c           
      F   XgXX1S 1:X  Ga(  [         R                  R                  5       n[        SXX#U4US[         R                  S4S9nUb  UR
                  " S$0 UD6  U$ XU4S:X  aT  X44S:X  a  SnSnSnS	nSn
S	n	GOX44S
:X  a  SnSnSnS	nSn
S	n	GOX44S:X  a  SnSnSnS	nSn
S	n	GOX44S:X  a  SnSnSnSnSn
S	n	GOwXU4S:X  aT  X44S:X  a  SnSnSnSnSn
Sn	GOZX44S
:X  a  SnSnSnS	nSn
Sn	GOEX44S:X  a  S	nSnSnS	nSn
S	n	GO0X44S:X  a  SnSnSnS	nSn
S	n	GOXU4S:X  ad  X44S:X  a  S	nSnSnSnSn
Sn	OX44S
:X  a  SnSnSnSnSn
Sn	OX44S:X  a  SnSnSnS	nSn
Sn	OX44S:X  a  SnSnSnS	nSn
S	n	OX44S:X  a  SnSnSnSnSn
S	n	OXU4S:X  ad  X44S:X  a  S	nSnSnSnSn
Sn	OX44S
:X  a  S	nSnSnS	nSn
Sn	OX44S:X  a  S	nSnSnS	nSn
S	n	OkX44S:X  a  SnSnSnS	nSn
S	n	OWX44S:X  a  S	nSnSnSnSn
S	n	OCXU4S:X  a;  X44S:X  a  SnSnSnSnSn
Sn	O'X44S
:X  a  SnSnSnSnSn
Sn	OX44S:X  a  SnSnSnSnSn
S	n	Uc*  SSS	SSSSSSS.	R                  US5      nUS:  a  US:  a  SnX(-  nUc  [        US:  a  SOSU5      nUc  [        US:  a  SOSU5      nU
=(       d    Sn
U	c  [        X5      S:  a  SSSS.R                  US	5      n	O`[        X5      S:X  a  SSSS.R                  US	5      n	O:[        X5      S:X  a  SS	S.R                  US	5      n	OSSS.R                  US	5      n	U=(       d    S	nXc::  d   [        XcS95       eX~::  d   [        X~S95       eX0::  d   [        XS 95       eX::  d   [        X.S!95       eXA::  d   [        XS"95       e[        S$UUUU
U	US#.UD6$ )%Nr   r         ?version)   r   r   )r;   r;   r   r;      )    r   r
   r   )@   r   )   r   )   r   r      r   r   )   r   r   )r   r   )   r   r   )   r   r   r   )	r;   r   r   r   r   r   r   r   i    r   r   r   )r;   r   r   )r;   r   )TILE_Mr   )TILE_Nr   )r   r   )r   r   )r   r   )r   r   
GROUP_SIZE
num_stages	num_warpsr   r6   )	r   r   get_device_namer   float16updategetrD   dict)r   r   r   r   r   r   r   r   r   r   r   extradevice_namer   r   s                  r   scatter_mm_metar    s    	JCvMjj0021"s+	
 KK % K !9
"x8#

	X%

	X%

	Z'

	AY*$x8#

	X%

	X%

	Z'

	AY+%x8#

	X%

	X%

	Z'

	Z'

	AY+%x8#

	X%

	X%

	Z'

	Z'

	AY+%x8#

	X%

	X%

	 

 #a* 	 9dG	
B~28RR0~28RR0qJq9tA1-11"a8IY$A1-11"a8IY#A**2q1IA**2q1IqJ<3V33<<3V33<7$D1$$77$D1$$77$D1$$7   r   c                    Uc  [         R                  nUc  UnUc  SnXyX1S 1:X  GaP  [         R                  R                  5       nXX#XES:H  US:H  US:H  4nXL a  UnOX4n[	        SUUUUU4S9nUc  US:w  a  [	        SUUUUS4S9nUc  XLa  [	        SUUXS4S9nUc  [	        S/ US S QSPUSS  Q7UUUS4S9nUc  XLa  [	        S/ US S QSPUSS  Q7UXS4S9n[        U=(       d    0 5       H=  nUU   nUS   nUS	   nUU-  nUU-  S:X  d  M"  UU::  d  M*  [        U5      nUU-  US	'   M?     Ub  UR                  " S0 UD6  U$ [        S
U < SU< SU< SU< SU< SU< SU< SU< SU< 35        U=(       d    [        X#-  S5      nU=(       d    SnU
=(       d    Sn
U	=(       d    Sn	[        SUUU
U	S.UD6$ )Nr   r   r   bsr_dense_addmmr   r
   *rd   r   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=z dtype=z out_dtype=r   )r   GROUP_SIZE_ROWr   r   r6   )
r   r   r   r   r   sortedr   r   r   r   )r   r   r   r   r   betaalphar   r  r   r   sparsityr,   	out_dtype_versionr  r  keyversion_dtyper   matching_metamkeymeta_r   split_ncs                             r   bsr_dense_addmm_metar    s   * }	J7D6Ajj002QB	419eqjA!M!,M}h7	
 <HO!!=#6	D <E2!3hs=SD <$!)#bq')3)QR)!=#6	M $)? (%-c"1g-s-SW-%c2	! }23%d+G	*Lq5A:!q&;D&'1fDO 4 KK % K
 t4QD!Ure6bU'D7(E88UHLi\[
 (QWaG#(qNqJQI %	
  r   c                   :    \ rS rSrSrS rS rS r\S 5       r	Sr
g)	TensorAsKeyiE  a  A light-weight wrapper of a tensor that enables storing tensors as
keys with efficient memory reference based comparision as an
approximation to data equality based keys.

Motivation: the hash value of a torch tensor is tensor instance
based that does not use data equality and makes the usage of
tensors as keys less useful. For instance, the result of
``len({a.crow_indices(), a.crow_indices()})`` is `2`, although,
the tensor results from `crow_indices` method call are equal, in
fact, these share the same data storage.
On the other hand, for efficient caching of tensors we want to
avoid calling torch.equal that compares tensors item-wise.

TensorAsKey offers a compromise in that it guarantees key equality
of tensors that references data in the same storage in the same
manner and without accessing underlying data. However, this
approach does not always guarantee correctness. For instance, for
a complex tensor ``x``, we have ``TensorAsKey(x) ==
TensorAsKey(x.conj())`` while ``torch.equal(x, x.conj())`` would
return False.
c                 t   S n[         R                  " U5      U l        UR                  [        R
                  L a  U" U5      U l        OUR                  [        R                  [        R                  1;   a2  U" UR                  5       5      U" UR                  5       5      4U l        OuUR                  [        R                  [        R                  1;   a2  U" UR                  5       5      U" UR                  5       5      4U l        O[        UR                  5      e[!        U R                  5      U l        g )Nc                    U R                   R                  (       d  U R                   R                  (       a   U R                   5       eU R                  5       U R	                  5       U R
                  U R                  5       U R                   4$ r   )r,   is_floating_point
is_complexdata_ptrstorage_offsetr#   rE   )objs    r   get_tensor_key,TensorAsKey.__init__.<locals>.get_tensor_key]  sb     		33syy7K7KWciiWL""$		

		 r   )weakrefref_obj_refr   r   stridedr  
sparse_csrr   r   r   
sparse_csc
sparse_bscccol_indicesrow_indicesr   hash_hash)selfr  r  s      r   __init__TensorAsKey.__init__\  s    	&  C(::&%c*DHZZE,,e.>.>??s//12s01DH ZZE,,e.>.>??s//12s01DH
 &cjj11$((^
r   c                     U R                   $ r   )r+  r,  s    r   __hash__TensorAsKey.__hash__  s    zzr   c                     [        U[        5      (       d  gU R                  b  UR                  c  XL $ U R                  UR                  :H  $ )NF)
isinstancer  r  r  )r,  others     r   __eq__TensorAsKey.__eq__  sB    %--88uyy0 = xx599$$r   c                 "    U R                  5       $ )z'Return object if alive, otherwise None.)r#  r0  s    r   r  TensorAsKey.obj  s     }}r   )r+  r#  r  N)__name__
__module____qualname____firstlineno____doc__r-  r1  r6  propertyr  __static_attributes__r6   r   r   r  r  E  s+    ,#$J%  r   r  )maxsizec	           
      6   UR                   n	U	c   eU	R                  5       U	R                  5       pU
R                  n[        R
                  nU S:X  Ga%  X7-  n/ n[        R                  " X}US9U-  n[        X-  5       Hq  nU
U   R                  5       nU
US-      R                  5       nUU:X  a  M4  UR                  UUU XS-  -  R                  U5      UR                  UU-
  5      -   5        Ms     [        R                  " U5      nU
R                  5       nUR                  5       nUXC-  -  nUU-   R                  S5      nU
nUU   R                  U5      nUR!                  SSS9u  nnUU   nU UUU4$ U S:X  Ga  X7-  n/ n/ n[        R                  " X}US9U-  n[        X-  5       H  nU
U   R                  5       nU
US-      R                  5       nUU:X  a  M4  UR                  [        R                  " UUXS9R                  U5      5        UR                  UUU XS-  -  R                  U5      UR                  UU-
  5      -   5        M     [        R                  " U5      nU
R                  5       nUR                  5       nUXC-  -  nUU-   R                  S5      n[        R                  " U
S S [        R"                  " UU   R                  U5      S5      45      n[        R                  " U5      nU UUUU4$ U S	:X  a  UnS/n/ n[        U5       H  n[        X-  5       H  nU
U   R                  5       nU
US-      R                  5       n[        X>-  5       Hm  nUR                  US   U-   U-
  5        [        UU-
  5       H>  n UU -   n!UU!   R                  5       UX%-  -  -   X>-  -  U-   n"UR                  U!U"/5        M@     Mo     M     M     U [        R$                  " UXS9[        R$                  " UXS94$ ['        S
U < S35      e)Nr   r   r   rt   T)
descendingstabler   r   r   zInvalid indices_format=z>. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm)r  r   r   r   r   int32arangerf   r   appendrepeatrepeat_interleavecatdiffnonzeror   sortcumsumr   r   )#r   r   r   r   r   r   nbatchesr   compressed_sparse_tensor_as_keyr   r   r   r   indices_dtyper   q_offsets_lstr<   r   r   r   r   crow_indices_diffnon_zero_row_indicesar   r   nnz_per_rowindicesp_offsets_lstr   
pq_offsetsr   r   r   r   s#                                      r   _bsr_scatter_mm_indices_datarZ    sD    *
-
-C?? # 0 0 2COO4E+  FKKM44\LLfEJqwAa%%'Ba!e$))+BRx  R#rv.66w?%%b2g./   IIm,	(--/088: BF+ULL$	 	'(<=OOPWX*//4/MWg&		9i@@	+	+\LLfEJqwAa%%'Ba!e$))+BRx  R=HOOPWX   R#rv.66w?%%b2g./   IIm,	(--/088: BF+ULL$	IIRa %&:;MMgV
	 IIm,		9iKK	<	'C	
xA17^!!_))+!!a%(--/qwA$$Yr]R%7"%<="27^F(^002Q!']BqwORSS"))1a&1 , ( $ ! LL-GLL=H
 	
 &~''ef
 	
r   c                 \   U R                  5       S:X  d   eU R                  S:X  d   eU R                  5       R                  SS nU R                  u  pVUu  pxUR                  SS u  pX:X  d   eUR                  SS R	                  5       n[        XVXU40 UD6nSU;  a:  UR                  U R                  [        R                  [        R                  1;   S9  US   n[        X%XjXxX[        U 5      5	      nUS:X  a  UR                  S	S
9  X4-   $ US:X  a  UR                  SS
9  X4-   $ U$ )zcComputes indices data for :func:`scatter_mm` used in BSR and
strided tensor matrix multiplication.
r   r
   r!   N
allow_tf32r\  r   r   T)is_compressedr   F)	dense_dimr   r   r#   numelr  r   r,   r   r   r.   rZ  r  )r   r5  r   
meta_inputr>   r   r   r   r   K_r   rO  r   r   r   s                  r   bsr_scatter_mm_indices_datarc    s4    ==?a88q==

""23'I99DAFBKKEB7N7{{3B%%'H19j9D:%syyU]]ENN,KKL9oG/1K<LL 44$'g%%	+	+%(g%%r   c           
         U R                   S:X  d   eUR                   S:  d   eU R                  S   U R                  S   UR                  S   penU R                  5       R                  SS nUc
  [        XSS9nUS   nUc>  [        R
                  " / UR                  SS QUPUP7U R                  U R                  S9nUR                  n	[        U5      nU R                  5       S:X  a  UR                  5         GOzUS	;   a+  UR                  5         [        U R                  5       XUS
9  GOIUS:X  Ga7  UR                  SS R                  5       n
[        R                  " X-  US   -  U-  US   -  US   US   4U R                  U R                  S9n[        U5      R                  SS5      R                  U
XgS   -  US   XWS   -  US   5      R!                  SS5      R#                  SS5      n[        U R                  5       XUS
9  UR%                  UR'                  SXUS   -  XgS   -  45      R!                  SS5      R)                  XU5      R                  SS5      5        O[+        U5      eUR                  U	5      $ )zBSR @ strided -> stridedr
   r!   rt   Nr   )r   r   r   >   r   r   r   r   r   )rd   r   r   r
   )r   r
   rd   r   )r   r#   r   rc  r   r   r,   r   r   _nnzr   r   r`  r   r   r   movedimr   copy_	unflattenreshaper   )r   r5  r   outr   r   r   r>   r   	out_shaperO  r   r   s                r   bsr_scatter_mmrl    so    88q==::??2		"u{{2BB

""23'I2'B
 "!_N
{kk'ekk#2''B'syy
 		I
C.C
xxzQ			J	J		3::<3G	<	';;s#))+{{1-2ilB!!
 ))::
 eYr2Tl"!l"! Wl WQ] 	 	3::<LQ		""HIaL0"!2DE Wl WX2&Yr2		
 ".1188Ir   Fr	  r
  
left_alpharight_alpharj  skip_checksmax_gridr   inputr   densern  ro  rj  rp  rq  r   c                j   Uc  UR                   [        R                  L a  SnUR                  5       nUR	                  5       S-
  nUR
                  U   nUR
                  S   n[        XU5      n[        R                  " UX4-   [        R                  UR                  S9n[        U UUUUUUUUU	U
S9$ )N_int_bsr_dense_addmmr   rt   r   rm  )r,   r   int8r   r"   r#   rT   r   rE  r   r  )rr  r   rs  r	  r
  rn  ro  rj  rp  rq  r   r   r   
batch_ndimr   r   original_batch_dims_broadcasteds                    r   ru  ru  \  s     {u{{ejj0''')!%%'!+
IIj!KKO*>vE*R'kk+qf4++<<

  r   c                	  ^^^
^ ^!^"^#^$^% SnUR                  5       nUR                  5       nUR                  5       nUR                  5       S-
  nUR                  XS-    u  nnUR                  US-   US-    nUR                  S   n[        XU5      nUc  UR                  UUU4-   5      nUR                  5       S:X  d  TS:X  d  US:X  d  US:X  d  US:X  aB  TS:X  a  UR                  5         U$ UR                  U 5        TS:w  a  UR                  T5        U$ Sm$Sm%Uc(  S	m$UR                  S
5      R                  " / UQUPUP76 nO*UR                  " / UQUPSP76 R                  " / UQUPUP76 nUc(  S	m%UR                  S
5      R                  " / UQUPUP76 nO*UR                  " / UQSPUP76 R                  " / UQUPUP76 nUR                  5       S   S:X  d   eUR                  5       S   S:X  d   eT
c[  [        SUR                  5       US   -  US   -  UU-  -  -
  S5      n[        UUUUS   US   TTUUR                   UR                   S9
m
Un[#        XX%Xg5      u  nnnn nnnnUu  m!m T
R%                  SUT!-  5      nUU-  m"Un['        UT!T"45      n['        UT T"45      n['        U T!T"45      n ['        UT!T"45      n['        UT!T"45      n[(        R*                  [,        R.                  [(        R0                  [,        R.                  [(        R.                  [,        R2                  [(        R2                  [,        R2                  [(        R4                  [,        R6                  [(        R6                  [,        R6                  0UR                      m#UR9                  S5      nUR9                  S5      S-
  nUR9                  S5      nUUU4nU	b*  [;        U	SS SSS2   5      SS[=        U	SS 5      -
  -  -   nOSnUSUSUSU SUSUSUSUS0nTS:w  d   eU U!U"UUU#U$U
U%4	S jn[?        UUUU5        URA                  5       URA                  5       :w  a*  UR                  UR                  UR                  5      5        U$ )zCompute

  out = beta * input + left_alpha.reshape(-1, 1) * (alpha * (bsr @ dense)) * right_alpha.reshape(1, -1)

where left_alpha, right_alpha are (* + 1)-D tensors when
specified, otherwise, these are treated as tensors filled with
ones.
r  r   r
   rd   rt   Nr   FTr6   r!   )r  r,   r  r   r   r   r   NNr   Nrt   )r   r   )r   r   Nc                    >	 [         U    " / [        U6 QTPTP7TS:H  TS:g  TS:H  TT
TTTT[        R                  :H  TS.
T	D6  g )Nr   r   )
beta_is_onebeta_is_nonzeroalpha_is_oneleft_alpha_is_oneright_alpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COLr\  	acc_dtype)_bsr_strided_addmm_kernelrb   tlfloat32)ry   r   BKBMBNr
  r	  dot_out_dtyper  r   r  s     r   r   bsr_dense_addmm.<locals>.kernel  ss    !$' 	
!>2	
	
 	
 	 AI!/1$

2#	
 	
r   )!r   r   r   r"   r#   rT   	new_emptyre  r   rg  mul_expandr   rE   roundr  r,   r   r   r   r   r   r  r  r.   float64rv  rE  r   r0   rA   r   r  )&rr  r   rs  r	  r
  rn  ro  rj  rp  rq  r   r   r   r   r   rw  r   r   r>   r   rx  r  
out_backupr   out_untiled	n_batchesn_block_rowsn_block_colsri   rj   rq   r   r  r  r  r  r  r  s&      ``     `                     @@@@@@r   r  r    s   , FZZ\F##%L//#K!!#a'J99Zq.1DAqZ!^j1n=IBA ';6&N#
{oo=AFG
xxzQ%1*Q!q&AF19IIK
 
 IIeqy
 __R(// 
,
./
12

  __L&ELqL!LSS 
,
./
12

 !oob)00 
,
./
12
 "&&N(GNNANUU 
,
./
12
 r"a'''#q(((|SXXZ)A,61EQOOQRS#aLaL++ii
 J 	s5kG	 FBhhy!r'*G	
gBK
C"b
*Ceb"X.Eeb"X.E":Bx8J#K"b:K 	rzz

rzzrzz

BHHRXX 
iiM 

1I$$R(1,L::b>LL,7IHRaL2./'QXbq\AR=R2SS 	m_{}K[[	O A::
 
$ &/9kB
||~,,.. 	))**:*:;<r   IS_BETA_ZEROr  r  TILE_Kr  r\  c            
         [         R                  " SS9n [         R                  " SS9n!UUU -  -   UU!-  -   n"[         R                  " U"5      n#[         R                  " U"U-   5      n$U$U#-
  n%U%S:X  a  g [         R                  " SU5      n&[         R                  " SU5      n'UUU -  -   U	U#-  -   U
U&S S 2S 4   -  -   UU'S S S 24   -  -   n(UUU -  -   UU#-  -   n)UUU -  -   UU!-  -   UU&S S 2S 4   -  -   n*UUU -  -   UU'S S S 24   -  -   n+[         R                  " SU5      n,[	        U%5       GH1  n-[         R
                  " X44US9n.[         R                  " U)5      n/[	        SXV5       H  n0U0U,-   n1U1U:  n2[         R                  " U*UU1S S S 24   -  -   U2S S S 24   SS9n3[         R                  " U+UU/-  -   UU1S S 2S 4   -  -   U2S S 2S 4   SS9n4U.[         R                  " U3U4UUS9-  n.M     U(       a  U.U -  n.OU U.-  U[         R                  " U(5      -  -   n.[         R                  " U(U.R                  UR                  R                  5      5        U(U	-  n(U)U-  n)GM4     g )Nr   axisr   r,           maskr5  r\  r  )r  
program_idloadrF  rf   r   dotstoretor,   
element_ty)5r
  r	  r  r  r  kr  
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stridemat1_ptrmat1_batch_stridemat1_tiled_row_stridemat1_tiled_col_stridemat1_row_block_stridemat1_col_block_stridemat2_ptrmat2_batch_stridemat2_tiled_row_stridemat2_tiled_col_stridemat2_row_block_stridemat2_col_block_strider  r\  	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrmat1_block_ptrsmat2_block_ptrsk_tile_arange_	acc_block	col_blockk_tile	k_offsetsmask_k
mat1_block
mat2_blocks5                                                        r   _sampled_addmm_kernelr  2  s   F MMq)	1- ')34!M12 	 
 WW45
''"9<O"OP "J.a<99Q699Q6 !I-.*,- &(8D(AAB &(8q(AA	B 	 &23 :-. 	 )+,#m34 $&6q$w&??@ 	 )+,#&6tQw&??@ 	 		!V,wA-!?yQI  12I1-"]2	"QWW#&;ia>P&PPa
  WW#+i78+i4.@@A  4
 RVV
zY 	% ., U"	!I-rww?P7Q0QQ	 HH&	Z5E5E5P5P(QR !22!33M  r   r  c           	         [         R                  " SS9n[         R                  " SS9n[         R                  " SS9n[         R                  " SS9n[         R                  " SS9n [         R                  " UUUU U5      u  nnUUU-  -   UU-  -   n![         R                  " U!5      n"[         R                  " U!U-   5      n#U#U"-
  n$U$S:X  a  g [         R
                  " SU5      n%[         R
                  " SU5      n&U UU-  -   UU"-  -   UU%S S 2S 4   -  -   UU&S S S 24   -  -   n'UUU-  -   UU-  -   UU&S S 2S 4   -  -   UU%S S S 24   -  -   n(UUU-  -   UU-  -   UU-  -   UU%S S 2S 4   -  -   UU%S S S 24   -  -   n)UU	U-  -   U
U"-  -   n*[         R                  " UU4US9n+[        U$5       Ho  n,[         R                  " U'5      n-[         R                  " U*5      n.[         R                  " U(UU.-  -   5      n/U+[         R                  " U-U/UUS9-  n+U'U-  n'U*U
-  n*Mq     [         R                  " U)U+R                  UR                  R                  5      5        g )Nr
   r  r   r   r  r  r  r  num_programs	swizzle2dr  rF  r   rf   r  r  r  r,   r  )0r  r  r  r  r  r  r  r  r  r  r  	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_strider  r  r  r\  r  r  r  col_block_pidr  r  r  r  r  r  r  r  r  dense_block_ptrsoutput_ptrsr  output_acc_blockr  values_blockdense_row_idxdense_blocks0                                                   r   "_bsr_strided_dense_rowspace_kernelr    s   \ MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.a<99Q699Q6 !I-.*,- &(8D(AAB &(8q(AA	B 	  9,-$}45 %'74'@@A %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 88]M$B)TwA77#45L GG$56M'' #9M#IIK
 kjI! 
 !22!33#  ( 	.11*2B2B2M2MNOr   c           
        ^ ^^^^^^^ UR                  S5      nUR                  S5      S-
  nX4nUb*  [        US S S S S2   5      SS[        US S 5      -
  -  -   nOS nUSUSUSU	SU
S0nUR                  [        R
                  [        R                  4;   a  [        R                  mS	mO[        R                  mS
mUUU UUUUU4S jn[        UUX5        g )Nr   rt   r   r
   r   )r   N)r   rt   )r   r|  TFc                 N   > [         U    " TTT/TQTPT	P[        U6 Q7TTSSS.6  g )Nr   r   )r  r\  r   r   )r  rb   )
ry   r   r  r\  r
  r	  r>   is_beta_zeror  tile_ks
     r   r   )_run_sampled_addmm_kernel.<locals>.kernelX  sY    !$' 	
   &~6 $%r   )r   r0   rA   r,   r   r-   r.   r  r  r  r   )r
  r	  r  r>   r  r  r   r   r   mat1mat2rq  r  r  ri   rj   rq   r   r  r\  s   ``````            @@r   _run_sampled_addmm_kernelr  4  s     KKN	#((,q0-	!TrT 23gSRTSTEVAV6WWKKI'')
 <<EJJ77

IJ

IJ	 	 	foyFr   g      ?)r	  r
  rj  rp  rq  r  r  c                n   Sn[        X5        [        XX5      n	U(       Gd  [        XU R                  5        [        XU R                  5        US:w  a/  U R                  [
        R                  L a  [        SU SU S35        U R                  [
        R                  La-  [        XU R                  5        [        XU R                  5        O[        XUR                  5        [        XU5        Ub  [        X5        [        XUR                  5        [        XU R                  5        [        UR                  U	R                  :H  =(       a!    UR                  5       U R                  5       :H  U SU	R                   SU	R                  5        SUR                   S	UR                  5        3	5        Uc  U	R                  UR                  S
S9nOUR                  U	5        UR                  5       S:X  d  UR                  5       S:X  a  U$ UR                  5       R                  SS  n
UR!                  S5      nUS:X  d  US:X  a!  UR                  5       R#                  U5        U$ Un[%        XQU5      u  ppn['        XS   U45      n['        X+U
S   45      n[)        U
6 n[+        UUUS:H  U
UUUUUUUU5        UR                  5       R-                  5       SS  UR-                  5       SS  :w  aF  UR                  5       R                  UR/                  UR                  5       R                  5      5        U$ )Nsampled_addmmr  Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r!   rt   r   r   )r   r   r   r   r,   r   boolr   r2   r*   r#   re  r  rg  r`  r   r   r  r   r   r   r  rE   ri  )rr  r  r  r	  r
  rj  rp  rq  r   input_broadcastedr>   r  r  r   r   r   r  s                    r   r  r  i  s    !'4VDOu||4u||4s{u{{ejj8h1$7jk {{%**,F%++6F%++6F$**5&vT: -V$++6F5II!2!8!88WSXXZ5::<=Wh?@Q@W@W?X Y-->-C-C-E,F G++.99+_SXXZLR ;#&&tzz&=CII'(99;!sxxzQJJJL&&rs+	IIbM C<16JJLd#J 
8FsRV8W56 !a'89 9Q<'89i!CK	
$ %%',0DD%%fnnZ5F5F5H5N5N&OPr   )rj  rp  rq  r   c          	      N   SnU R                   SS  u  pxU(       d  [        X`5        [        X`UR                  5        [	        X`UR
                  [        R                  45        [        X`U5        UR                  S5      n	U R                  5       R                   SS  u  p[        XjU45        [        U	S-  (       + U SU	 S35        OUR                   SS  u  p[        X`U5      nUbx  U(       dq  XU	4-   n[        UR                   U:H  SU SUR                    S	35        [        UR                  5       =(       d     UR                  SS5      R                  5       S
5        Uc  UR!                  XU	4-   5      nU R#                  5       S:X  a  UR%                  5       $ ['        X USSUS9$ )Nbsr_dense_mmr!   rt   r;   z(): dense.size(-1) == z should be divisible by 16z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got r    zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   r   )r
  r	  rj  )r#   r   r   r   r2   r,   r   rv  r*   r   r   rB   r   rT   is_contiguousr   r  re  r   r  )r   rs  rj  rp  rq  r   r   r   _klr   	row_blockr  _krrx  expected_out_shapes                  r   r  r    s     23V)ell3U[[5::-@&vE:

2A#&::<#5#5bc#: IF	$:;F
(03MN
 [[%FC*>vE*R'?;!@q6!I		//./z#))AG
 !!#Ls}}R'<'J'J'L" ;//"AF"JKC 88:?99; saaSIIr   MAX_ROW_NNZTILEc                    [         R                  " SS9n[         R                  " SS9n[         R                  " SS9nU X-  -   X--  -   n[         R                  " U5      n[         R                  " X-   5      nUU-
  nUS:X  a  g [         R                  " SU
5      nUUU-  :  nUXK-  -   X\-  -   X-  -   n[         R                  " UU-   U[	        S5      * S9R                  [         R                  5      n[         R                  " USS9n[        XU
5       H  nUU
-  nUUU-  :  n[         R                  " UU-   U[	        S5      * S9R                  [         R                  5      n[         R                  " USS9n[         R                  " UU:  UU5      nM     [         R                  " UU-
  5      n[         R                  " USS9n[        XU
5       H  nUU
-  nUUU-  :  n[         R                  " UU-   U[	        S5      * S9R                  [         R                  5      n[         R                  " UU-
  5      nU[         R                  " USS9-  nM     [         R                  " UU-   UU-  R                  UR                  R                  5      US9  [        XU
5       H  nUU
-  nUUU-  :  n[         R                  " UU-   U[	        S5      * S9R                  [         R                  5      n[         R                  " UU-
  5      n[         R                  " UU-   UU-  R                  UR                  R                  5      US9  M     g )Nr
   r  r   r   infr  )r  )r  r  r  rF  r/   r  r  r   rf   whereexpsumr  r,   r  )r  r  r  r  r  r  values_nnz_col_block_strider   r  r  r  r  row_block_offset_pidr  r  r  r  r  
row_aranger  curr_row_values_ptrsrow_tilemax_row_valuer  curr_max_row_valuenumdenoms                              r   _bsr_softmax_kernelr    s'    MMq)	!}}!41- '34!12 	 
 WW45
''"9"OP "J.a<YYq$'
Gi// !-.%<= $% 	 77 :-Du

"RZZ. 	 xa0t$/A$J) 33Dww$z1U5\Mbn  "$q!9HH 22MCUM 0 ffX-.s#t$/A$J) 33Dww$z1U5\Mbn  &&M12CRVVCa((E 0 	 :-5[Z--889	

 t$/A$J) 33Dww$z1U5\Mbn  &&M12CHH$z1u  !1!1!<!<= 0r   c                 2  ^^^ Sn[        X 5        [        X U R                  5        U R                  5       S:X  d  U R	                  5       S:X  a  U R                  5       $ U R                  SS  u  p4U R                  5       nU R                  5       R                  SS  u  mmTc  [        R                  " U5      mO[        R                  " T5      mU R                  5       R                  S5      R                  SS5      nU R                  5       R                  SS5      R                  5       (       a  U R                  5       R                  5       nOU R                  5       nUR                  SS5      R                  5       R                  S5      R                  SS5      R!                  STUT-  5      nUR                  S   TUT-  4nS n	USS S24   SUS	0n
UUU4S
 jn[#        XX5        UR!                  STUT5      R                  SS5      R                   " U R                  5       R                  6 n[$        R&                  " U R                  5       R                  5       U R)                  5       R                  5       UU R                  U R*                  S9$ )Nbsr_softmaxr   r!   r   r|  rt   .r{  rz  c                 T   > [         U    " / [        U6 QTPTPTP[        ST5      P76   g )Ni   )r  rb   rD   )ry   r   r  max_row_nnzr   s     r   r   bsr_softmax.<locals>.kernel}  sJ    % %~6  	 E;'r   r   )r   r2   r,   re  r`  cloner#   r   tritonnext_power_of_2r   r   r   r   r  rF   ri  r   r   r   r   r   )rr  r  r   r   r   nnzr   r   ri   rj   rq   r   r  r   s    `          @@r   r  r  P  s3   'F5;;/::<1 2;;= {{23jjl$||~33BC8	9 003K 00=K))+55a8@@BG <<>##B+99;;\\^))+F\\^FR$Z\Yq\WQ^WRC)O4 	 \\!_ii@	 crc"MO	
	 	fyF NN2y#y9Yr2Wlln**, 	 -- &&(%%'<<
 	
r   queryr  value	attn_mask	dropout_p	is_causalscalec           	         Sn[        U(       + U S35        [        US LU S35        Uc   e[        UR                  [        R                  :H  U S[        R                   SUR                   S35        [	        XqU R
                  5        [	        XrU R
                  5        [	        XsU R
                  5        [        XqU R                  5        [        XrU R                  5        UR                  [        R                  La  [        XsU R                  5        [        X0UR                  SS5      S	S
S9nUc  U R                  S5      S:X  d  US	:X  a  [        S
U SU S35        Uc(  S[        R                  " U R                  S5      5      -  OUn	UR                  5       R                  U	5        [!        U5      n[        R"                  R$                  R'                  UR                  5       USS9  [)        X5      nU$ )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.z(): attn_mask.layout must be z, but got attn_mask.layout == r    r!   rt   r  F)r	  rp  r   z(): current value of scale == z results in division by zero.r   T)r   inplace)r   r   r   r   r   r   r2   r,   r  r  r   r   mathsqrtr   r  r  nn
functionaldropoutr  )
r  r  r  r  r   r!  r"  r   sdpascale_factors
             r   r$  r$    s    1)mx'NOPit#x/V%WX$$$ 0 00h ((-(8(8'9 :##,#3#3"4A7	
 	V%,,/VELL1V5F-F5;;/??%**,5;;7cmmB3#5
 =UZZ^q0ESL(8 @/ /
 9>q499UZZ^445<(4 ##DKKMY#MD(r   r   r   r   r  r   r   c           	      .   U U-  n[         R                  " SS9n[         R                  " SS9nUU-  nUU-  nUU-  [         R                  " SU5      -   nUU-  [         R                  " SU5      -   n[         R                  " SU5      nUUS S 2S 4   U-  US S S 24   U-  -   -   n UUS S 2S 4   U	-  US S S 24   U
-  -   -   n![         R                  " UUU-  -   5      n"[         R                  " UUS-   U-  -   5      n#U"U#:X  a  g [         R                  " UU4US9n$[        U"U#5       H  n%[         R                  " UU%U-  -   5      n&[         R                  " UU%U-  -   U-   5      n'[         R                  " U U&U-  -   5      n([         R                  " U!U'U-  -   5      n)U$[         R                  " U(U)UUS9-  n$M     UUU-  -   US S 2S 4   U-  US S S 24   U-  -   -   n*[         R                  " U*U$R                  UR                  R                  5      5        g Nr   r  r   r  )r  r\  )r  r  rF  r  r   rf   r  r  r  r,   r  )+r   r   r   
blocks_ptrblocks_stride_Pblocks_stride_Mblocks_stride_K
others_ptrothers_stride_Qothers_stride_Kothers_stride_Naccumulators_ptraccumulators_stride_Raccumulators_stride_Maccumulators_stride_Npq_offsets_ptrpq_offsets_stridepq_ptrpq_stride_Tpq_stride_1r  r   r   r\  r   pid_tpidpid_mpid_nrmrnrkA_ptrB_ptrr   r   r  r   r   r   Ar   C_ptrs+                                              r   _scatter_mm2_kernelrK    s%   6 &[1%mm#r	bV^bii622V^bii622YYq!_q$wK/)BtQwK/,II
 q$wK/)BtQwK/,II
 WW^e.?&??@WW^uqy4E&EEF8HHff-]C	r2A[01A[0;>?AO 334AO 334A1*UUI  ++, 1d733T1W+ 556 	 		%5%;%;%F%FGHr   r   r   rY  
pq_indicesr   c                   ^^^ U R                   u  nmnUR                   u  pxm[        [        STS-  5      [        STS-  5      SSS9n	UUU4S jn
[        R                  [
        R                  [        R                  [
        R                  [        R                  [
        R                  [        R                  [
        R                  0UR                     nSU	;  a   U	R                  U[
        R                  :H  S9  [        U
   " TUTU U R                  S	5      U R                  S5      U R                  S5      UUR                  S	5      UR                  S5      UR                  S5      UUR                  S	5      UR                  S5      UR                  S5      TTR                  S	5      UUR                  S	5      UR                  S5      4S
U0U	D6  g )Nr;   r   r   r
   )r   r   r   r   c                    > TR                   S   S-
  [        R                  " TU S   5      [        R                  " TU S   5      -  S4$ )Nr   r   r   r   r#   r  cdiv)METAr   r   rY  s    r   ry   _scatter_mm2.<locals>.grid  sI      #a'AtH~.QX1OO r   r\  r]  r   r  )r#   r   r   r   r   r  r  r.   r  r,   r   rK  rE   )r   r   rY  rL  r   r   r   r   r  r   ry   r  r   r   s     `         @@r   r   r     s    <<Aq<<qr16?3r16?qTU
	 MM2::NNBJJMM2::MM2::	

 

 t#KK=BJJ#>K?D!MM!MM!MM!MM!MM!MM!"""a a a )	
* (+	
, -	
r   r   r^  r   r   c           	         UU-  nUU-  nUU-  n[         R                  " SS9n[         R                  " SS9nUU -  n UU -  n!UU-  n"UU"-  n#U#U-  n$[        UU$-
  U5      n%U$UU%-  -   n&UU"-  U%-  n'U&U-  [         R                  " SU5      -   n(U'U-  [         R                  " SU5      -   n)[         R                  " SU5      n*UU(S S 2S 4   U-  U*S S S 24   U-  -   -   n+UU U	-  -   U*S S 2S 4   U
-  U)S S S 24   U-  -   -   n,[         R                  " UU!-   5      n-U(       aY  U-U-  U-  n.U-U-  U-  n/[         R                  " UU.-   5      n0[         R                  " UU.-   S-   5      n1U/U1-  UU/-
  U0-  -   n2U1U0-
  n3O:[         R                  " UU!-   5      n2[         R                  " UU!-   S-   5      n4U4U2-
  n3UU2-   n5[         R
                  " UU4US9n6U(       a  U+W0U-  -  n+[        U35       Hl  n7[         R                  " U55      n8[         R                  " U,U8-   5      n9[         R                  " U+5      n:U6[         R                  " U:U9UUS9-  n6U+U-  n+U5S-  n5Mn     OUU2-   n;[        U35       H  n7[         R                  " U55      n8[         R                  " U,U8-   5      n9[         R                  " U;5      n<[         R                  " U+U<U-  -   5      n:U;S-  n;U5S-  n5U6[         R                  " U:U9UUS9-  n6M     UU--   U U-  -   U(S S 2S 4   U-  U)S S S 24   U-  -   -   n=[         R                  " U=U6R                  UR                  R                  5      5        g r.  )r  r  rD   rF  r  r   rf   r  r  r  r,   r  )>rO  r   r   r   r/  r0  r1  r2  r3  others_stride_Br5  r6  r7  accumulators_stride_Br9  r:  c_indices_ptrr_offsets_ptrp_offsets_ptrq_offsets_ptrr^  r  r   r   r   r   r\  r   BLOCKS_MBLOCKS_Npid_t_rA  pid_br@  num_pid_in_groupgroup_idfirst_pid_mgroup_size_mrB  rC  rD  rE  rF  rG  rH  r   r   r   r   r   r   r  r   q_ptrr  r  r   r   rI  p_ptrr   rJ  s>                                                                 r   _scatter_mm6_kernelrd  B  s   < '\<<A&mm#!("%0**+8k1:>s\12''L8V^bii622V^bii622YYq"q$wK/)BtQwK/,II
 o%&!T'{_,r$'{_/LLN 	 GGME)*aBAQ2A*+B*Q./BR7Q;",,Br'C./B.23Br'C"HHff-]C	R/))E3ZGGENGGEAI&GGENRVVqMj 	 (
   "B&E3ZGGENGGEAI&GGENGGEA$778

RVVqMj 	   ++, 1d733T1W+ 556	 	 		%5%;%;%F%FGHr   r   r   r   r   force_contiguousc	                   ^^^^ US   n	U R                   u  n
mnUR                   u  mpUR                   u  pnUU:X  d   eX-  mUT:X  d   eUUUU4S jn[        R                  [        R                  [        R
                  [        R                  [        R                  [        R                  [        R                  [        R                  0UR                     nSU;  a   UR                  U[        R                  :H  S9  UR                  S5      S:X  d   eTR                  S5      S:X  d   eUR                  S5      S:X  d   eUR                  S5      S:X  d   eU(       aI  U R                  5       n UR                  5       nUR                  5       (       d  UR                  5       nOUnOUn[        U   " TTUUU U R                  S5      U R                  S5      U R                  S5      UUR                  S5      UR                  S5      UR                  S5      UUR                  S5      UR                  S5      UR                  S5      UTUU4SU0UD6  U(       a(  UR                  5       (       d  UR                  U5        g g g )	Nr   c                    > TR                   S   T-  [        R                  " TU S   5      [        R                  " TU S   5      -  4$ )Nr   r   r   rO  )rQ  r   r   r   r   s    r   ry   _scatter_mm6.<locals>.grid  sD    "Q&BX/&++b$x.2QQ r   r\  r]  r   r   r
   r  )r#   r   r   r  r  r.   r  r,   r   rE   rF   r  rd  rg  )r   r   r   r   r   r   r   r   re  r   r   r   _Kr   B__Mr   ry   r  accumulators_r   r   r   s      `                @@@r   r   r     sX    y/\\
B<<2!''
Qww\Qww	 	 MM2::NNBJJMM2::MM2::	

 

 t#KK=BJJ#>K?"a'''"a'''"a'''"a''' &&(F&&(F--// , 7 7 9 ,(MD!MM!MM!MM!MM!MM!MM!  #  #  #)	
* (+	
, -	
2 L$>$>$@$@}- %Ar   left_alpha_tiled_col_strideleft_alpha_col_block_strideright_alpha_tiled_row_strideright_alpha_row_block_strider~  r  r  r  r  r  c7           	      >   US:X  d   eUS:X  d   eUS:X  d   eU!S:X  d   e[         R                  " SS9n7[         R                  " SS9n8[         R                  " SS9n9[         R                  " SS9n:[         R                  " SS9n;[         R                  " U8U9U:U;U55      u  n8n9UUU7-  -   UU8-  -   n<[         R                  " U<5      n=[         R                  " U<U-   5      n>U>U=-
  n?[         R
                  " SU05      n@[         R
                  " SU25      nA[         R
                  " SU15      nBU UU7-  -   UU=-  -   UU@S S 2S 4   -  -   UUAS S S 24   -  -   nCUUU7-  -   UU9-  -   UUAS S 2S 4   -  -   UUBS S S 24   -  -   nDU#U$U7-  -   U%U8-  -   U&U9-  -   U'U@S S 2S 4   -  -   U(UBS S S 24   -  -   nEUU	U7-  -   U
U=-  -   nF[         R                  " U0U14U3S9nG[        U?5       Ho  nH[         R                  " WC5      nI[         R                  " WF5      nJ[         R                  " WDUUJ-  -   5      nKWG[         R                  " UIUKU4U3S9-  nGUCU-  nCUFU
-  nFMq     U-(       d  WGU*-  nGU.(       dG  UUU7-  -   UU8-  -   UU9-  -   UW@S S 2S 4   -  -   UWBS S S 24   -  -   nLWG[         R                  " UL5      -  nGU/(       dG  UUU7-  -   UU8-  -   U U9-  -   U!W@S S 2S 4   -  -   U"WBS S S 24   -  -   nMWG[         R                  " UM5      -  nGU,(       ak  UUU7-  -   UU8-  -   UU9-  -   UW@S S 2S 4   -  -   UWBS S S 24   -  -   nNU+(       a  WG[         R                  " WN5      -  nGOWGU)[         R                  " WN5      -  -  nG[         R                  " WEWGR                  U#R                  R                  5      5        g )Nr   r
   r  r   r  r  r  )Or  r  r  r  r  r  r  r  r  r  r  	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_strider  r  r  r  r  r  left_alpha_ptrleft_alpha_batch_strideleft_alpha_tiled_row_striderm  left_alpha_row_block_stridern  right_alpha_ptrright_alpha_batch_stridero  right_alpha_tiled_col_striderp  right_alpha_col_block_strider  r  r  r  r  r  r	  r
  r~  r  r  r  r  r  r  r  r  r\  r  r   r  r  r  r  r  r  r  r  r  r  inner_block_aranger  r  r  r  r  r  r  r  r  r  left_alpha_ptrsright_alpha_ptrs
input_ptrssO                                                                                  r   r  r  	  s   V +a///*a///+q000+q000MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.99Q6YYq/:99Q6 !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!T''BBC %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 88]M$B)TwA77#45L GG$56M'' #9M#IIK
 kjI! 
 !22!33#  & % )I56-=> .=> .0@D0II	J
 .0@q0IIJ   88!*Y67.>? />? /1A!T'1JJ	K
 /1A$'1JJK  (8 99$y01(=89 )=89 )+;AtG+DD	E
 )+;D!G+DDE   BGGJ$77  D277:+>$>>  	.11*2B2B2M2MNOr   r   )NNNNNN)NNNNNNNr   )r   )NN)r  FN)T)Cr&  osr!  	functoolsr   typingr   r   torch._dynamo.utilsr   torch.utils._tritonr   _triton_ops_metar   intgetenvr	   r   r   r   r*   r2   rB   rG   rT   rY   r`   rb   r{   r   r   r   r   r   r   r  r  r  rZ  rc  rl  Tensorr  r0   r   ru  r  r  triton.languagelanguager  jit	constexprr  r  r  r  r  r  r  r/   r$  rK  r   rd  r   r  r6   r   r   <module>r     s    	     ) * & .1II:A>. *

"(&U;2&0 7F	
/ >B m2l kl 
`FK K\ =>]
 ?]
B  ;BDX 

)-*."&MQ&<<&	& <<& && %,,'& 
%,,	& & uXc]HSM8C=HIJ& 4.&\ 

)-*."&MQf<<f	f <<f &f %,,'f 
%,,	f f uXc]HSM8C=HIJf 4.fR << ZZ{4 ll{4 ||	{4
 ||{4 {4> <<?{4@ LLA{4 {4z ZZAPN ||OAPP ||QAPR <<SAPT LLUAPV WAP APF3Gt &*!QUU||UllU llU ell#U U 5#x}!LMNUv '+!QU#5J\\5J||5J ell#	5J
 5J 5#x}!LMN5J tn5Jn ZZV \\V llV VpE
X !%,||,\\, ||, ELL)	,
 , , ,\ ZZDI<<DI<<DI <<DI* ||+DI, -DI. /DI0 LL1DI DIL4
4
4
 LL4
 LL	4

 ll4
l ZZoI LLoI* ||+oI, ||-oI. /oI0 1oI2 3oI4 LL5oI6 LL7oI oIt "&X.X.X. <<X. <<	X.
 <<X. <<X. X. llX. X.t ZZIPL &(\\MIPP &(\\QIPZ ')ll[IP^ ')ll_IPx \\yIPz {IP| ll}IP~ <<IP@ LLAIPB ||CIPD ||EIPF GIPH <<IIPJ LLKIPL MIPN OIP IPX KLM$(!LL $r   