
    Ph8                    	   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ ddlm	Z	 d dl
mZmZ  e ej                  dd            Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd ZdldZd Zd Zd Zd Z dddZ!	 dmdZ"	 dndZ#	 dodZ$ G d d       Z% ee!      d"        Z&dpd#Z'dqd$Z(dddd%ddd&d'ejR                  d(ejR                  d)ejR                  d*eejR                     d+e*d,eeee   ee   ee   f      d-ee+   fd.Z, e       rd dl-Z-d dl.m/Z0 e-jb                  d/e0jd                  d0e0jd                  d1e0jd                  d2e0jd                  d3e0jd                  d4e0jd                  fd5       Z3e-jb                  d0e0jd                  d1e0jd                  d3e0jd                  d4e0jd                  d6e0jd                  f
d7       Z4d8 Z5d9 Z6d:d:dd%dd;d'ejR                  d<ejR                  d=ejR                  d*eejR                     d+e*d,eeee   ee   ee   f      fd>Z7dd%ddd?d@d(ejR                  d)ejR                  d*eejR                     d+e*d,eeee   ee   ee   f      d-ee+   dAe*fdBZ8e-jb                  dCe0jd                  dDe0jd                  fdE       Z9dldFZ:	 	 	 drdGejR                  dHejR                  dIejR                  dJeejR                     dKe;dLe*dMee;   fdNZ<e-jb                  dOe0jd                  dPe0jd                  dQe0jd                  dRe0jd                  dSe0jd                  dTe0jd                  d4e0jd                  fdU       Z=dVejR                  dWejR                  dXejR                  dYejR                  dZejR                  f
d[Z>e-jb                  d\e0jd                  d]e0jd                  dRe0jd                  d^e0jd                  dSe0jd                  dTe0jd                  d_e0jd                  d4e0jd                  fd`       Z?	 dsdVejR                  dWejR                  daejR                  dbejR                  dcejR                  ddejR                  d-e+dZejR                  dee*fdfZ@e-jb                  dge0jd                  dhe0jd                  die0jd                  d0e0jd                  d1e0jd                  dje0jd                  d3e0jd                  d4e0jd                  d6e0jd                  d^e0jd                  fdk       ZAydZ:dZ8dZ7dZ<dZ>dZ@dZAy)t    N)	lru_cache)
has_triton   )get_meta)OptionalTuple*TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE   c                     | st        |      y N)
ValueError)condmsgs     cC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/sparse/_triton_ops.pycheckr      s    o     c                 X    t        |j                  t        j                  k(  |  d       y )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layouttorch
sparse_bsr)f_namets     r   check_bsr_layoutr      s'    		E$$$(RSr   c                 r    t        |j                  |k(  xr |j                  j                  dk(  |  d       y )Ncudaz9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   s      r   check_devicer      s3    		F6qxx}}6(KLr   c           	      *   t        |j                         dk\  xr |j                         dk\  |  d|j                          d|j                          d       |j                  dd  \  }}|j                  dd  \  }}t        ||k(  |  d| d| d       y )Nr
   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhsmklkrns          r   check_mm_compatible_shapesr*       s    		Q)3779>(   #	{*<SWWYKq	J IIbcNEArIIbcNEB	
b(   "t#KB4q	Rr   c           	          t        |j                  |k(  xrD |j                  t        j                  t        j                  t        j
                  ft        | z   v |  d| d|j                   d       y )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r    )r   dtyper   halfbfloat16floattuple)r   r   r,   additional_dtypess       r   check_dtyper2   1   sl    		5 	_GGU^^U[[AEK\D]]^( 33D2E FGG9A	'r   c           	      x    t        |      dk(  sJ d fd}t         ||      |  d|d    d|d    d       y )	Nr
   c                     | | dz
  z   S Nr    )vs    r   is_power_of_twoz(check_blocksize.<locals>.is_power_of_two>   s    QK  r   c                 @    d}| D ]  }|dk\  xr  |      xr |} |S )NT   r6   )bres	blocksizer8   s      r   is_compatible_blocksizez0check_blocksize.<locals>.is_compatible_blocksizeA   s1    I?Ay'AJsC  
r   z(): sparse inputs' blocksize (r   z, r   z;) should be at least 16 and a power of 2 in each dimension.)lenr   )r   r=   r>   r8   s      @r   check_blocksizer@   ;   sX    y>Q! 
	*(01b1 OD 	Dr   c                 ^    t        | j                               dk7  r| j                         S | S )a  Return input as a triton-contiguous tensor.

    A triton-contiguous tensor is defined as a tensor that has strides
    with minimal value equal to 1.

    While triton kernels support triton-non-contiguous tensors (all
    strides being greater than 1 or having 0 strides) arguments, a
    considerable slow-down occurs because tensor data is copied
    element-wise rather than chunk-wise.
    r   )minstride
contiguous)r   s    r   make_triton_contiguousrE   O   s)     188:! ||~r   c                 r    	 t        j                  d |D         S # t        $ r t        d|  d       Y y w xY w)Nc              3   :   K   | ]  }|j                   d d   y wNr!   r#   .0r   s     r   	<genexpr>z'broadcast_batch_dims.<locals>.<genexpr>d   s     'Fgg   Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorss     r   broadcast_batch_dimsrQ   b   sB    U%%'Fg'FGG UexRSTUs    66c              '   l   K   |D ]+  }t        d       g|j                         z  }||| <   ||    - y wr   )slicer"   )r"   slice_rangerP   r   slicess        r   slicerrV   i   s8     +(!si s   24c              '      K   |D ]B  }t        d       g|j                         z  }t        | |      D ]  \  }}|	|||<    ||    D y wr   )rS   r"   zip)dimsrU   rP   r   sdd_slices          r   multidim_slicerr]   p   sT     4[MAEEG#dF+JAw}! , d
 s
   7AAc               '   V   K   | D ]  }| |j                         E d {      y 7 wr   )rC   )rP   r   s     r   ptr_stride_extractorr_   y   s'     88: s   )')c           
   #      K   dt               cxk  rdk  sJ  J dt              cxk  rdk  sJ  J dd l} fd}fd} |j                   |        D ]p  }t         |      D 	cg c]  \  }}}	t	        ||z
  |	       }
}}}	t        ||
      D cg c]  \  }}t        |||z          }}}|
d d d   g ||       r y c c}	}}w c c}}w w)Nr      c               3   T   K   t              D ]  \  } }t        d| |        y w)Nr   )rX   range)fgmg	full_gridgrid_blockss     r   generate_grid_pointsz.grid_partitioner.<locals>.generate_grid_points   s+     )[1FB2r"" 2s   %(c              3   n   K   j                         D ]  \  }}t        t        || |              y wr   )itemsnextr]   )rU   r   t_dimstensor_dims_maps      r   generate_sliced_tensorsz1grid_partitioner.<locals>.generate_sliced_tensors   s2     (..0IAvvvq9:: 1s   25)r?   	itertoolsproductrX   rB   rS   )rf   rg   rm   rp   rh   rn   
grid_pointrd   gpre   gridgrU   s   ```          r   grid_partitionerrv      s     I#!#####K %A%%%%%#; (i'')=)?@
36y*k3Z[3ZZRRBGR 3Z[14Z1FG1FA%BF#1FG 4R4j:26::: A[Gs   A*C/C
	CC5'Cc                     dd d d   }||}n!d t        fdt        ||      D              }t        |||      D ]  ^}} | |g|   y )N)i  rx   ro   c                 6    | |S t        dt        | |            S r5   )maxrB   )ru   re   s     r   valid_grid_dimz%launch_kernel.<locals>.valid_grid_dim   s!    y	 1c!Rj))r   c              3   6   K   | ]  \  }} ||        y wr   r6   )rK   ru   re   r{   s      r   rL   z launch_kernel.<locals>.<genexpr>   s!      
/NeaN1b!/Ns   )r0   rX   rv   )kernelrm   rf   rg   cuda_max_gridrt   sliced_tensorsr{   s          @r   launch_kernelr      si    .tt4M#	*  
/2;/N
 
 "2)[/!Z~t%n% "[r   c           
      8   | j                         j                  d      }| j                         j                  d      }t        | j	                         j                  d            }|D cg c]  }t        |j                  d             }}t        j                  |j                  d d gd |D         }d } |||d      } |||d      } ||||j                  dd        }|D cg c]  } ||||j                  dd         }}|||g|S c c}w c c}w )Nr   c              3   :   K   | ]  }|j                   d d   y wrH   rI   rJ   s     r   rL   z!prepare_inputs.<locals>.<genexpr>   s      Hg_fZ[QTRT_frM   c                 b    | j                  ||z         j                  dt        |      dz
        S )Nr   r   )broadcast_toflattenr?   )r   
batch_dimsinvariant_dimss      r   batch_broadcast_and_squashz2prepare_inputs.<locals>.batch_broadcast_and_squash   s1    ~~j>9:BBs:"
 	
r   ro   r!   )crow_indices	unsqueezecol_indicesrE   valuesr   rN   r#   )	bsrdense_tensorsr   r   r   r   rP   batch_dims_broadcastedr   s	            r   prepare_inputsr      s;   ##%//2L//#--a0K#CJJL$:$:1$=>F?LM}!%akk!n5}GM #33FLL"4EhHg_fHgh

 .,eL -+UK (&RS(9F V]U\PQ"1&<aggbclKU\   f6w665 N,s   +!D*Dc                    t        | |g| }|j                         j                  |dz         }|j                         j                  |dz         }|j	                         j                  ||j	                         j
                  dd  z         }||j
                  dd  z   }t        j                  |||||j                        S )Nr   r   r!   sizer   )	rQ   r   r   r   r   r#   r   sparse_compressed_tensorr   )r   r   rP   batch_shaper   r   r   r   s           r   broadcast_batch_dims_bsrr      s    &vs=W=K##%22;3FGL//#00u1DEKZZ\&&{SZZ\5G5G5L'LMF23'D)),VRV_b_i_ijjr   c                     | j                   ^ }}}|||d   z  |d   ||d   z  |d   gz   }| j                  |      j                  dd      S )Nr   r   r   r!   )r#   view	transpose)r   r=   restr&   r)   	new_shapes         r   tile_to_blocksizer      sd    ''KT1a	Yq\!	Yq\!	 I 66)&&r2..r   c                    | j                   dk  r!| j                  d      } | j                   dk  r!| j                   dkD  r| j                  d| j                   dz
        } | j                   dk(  sJ | j                         | S )zReturn tensor as 3D tensor by either prepending new dimensions to
    the tensor shape (when ``tensor.ndim < 3``), or by collapsing
    starting dimensions into the first dimension (when ``tensor.ndim >
    3``).
    ra   r   )ndimr   r   r#   )tensors    r   	as1Dbatchr      so     ++/!!!$ ++/{{Q6;;?3;;!)V\\)Mr   accumulatorsc                   |d   }| j                   dk(  sJ | j                  \  }}}|dk(  r|dd \  }}	|j                   dk(  sJ |j                  \  }
}}||k(  sJ |B|j                  d   dz
  }t        j                  |||f| j                  | j
                        }n|j                  \  }}}||k(  sJ ||k(  sJ |dz  s|dz  s|dz  st        ^t        |j                  d   dz
        D ]>  }||   }||dz      }t        ||      D ]   }|	|   \  }}||xx   | |   ||   z  z  cc<   " @ |S t        | |||	|       |S |dk(  r|j                  }t        |      }|j                  \  }}}||z  dk(  sJ |dd \  }}}}}|d	   }|^||j                         j                         dz   |z  z   } t        j                  g |dd
 | || j                  | j
                        }n|j                  d
d \  } }!|!|k(  sJ |j                  }"t        |      }||z  }|dz  s|dz  s|dz  st        |j                          t        |      D ]  }#t        |j                  d         D ]  }||   j                         }$||   j                         }||dz      j                         }t        |$|      \  }%}&||#|%|%|z   |&|&|z   f   }'t        ||      D ]D  }||   ||   }}t        |j                         |      \  }(})|'| |   ||#|(|(|z   |)|)|z   f   z  z  }'F   nt        | |||||||       |j                  |"      S |dk(  r1|j                  }t        |      }|j                  \  }}}||z  dk(  sJ |dd \  }}}}|d	   }|^||j                         j                         dz   |z  z   } t        j                  g |dd
 | || j                  | j
                        }n|j                  d
d \  } }!|!|k(  sJ |j                  }"t        |      }||z  }|dz  s|dz  s|dz  st        t        |      D ]  }#t        t        |            D ]  }*t        ||*   j                         |      \  }%}&|%|z  }+|&|z  },||+   j                         }-||+dz      j                         }.||#|%|%|z   |&|&|z   f   }'t!        t        |-|.            D ]Q  \  }/}||,|.z  ||,z
  |-z  z   |/z      j                         }t        ||      \  }(})|'| |   ||#|(|(|z   |)|)|z   f   z  z  }'S   n>t        j"                  d|j                  |j
                        }t        | |||||||       |j                  |"      S t%        |      )ad  Scattered matrix multiplication of tensors.

    A scattered matrix multiplication is defined as a series of matrix
    multiplications applied to input tensors according to the input
    and output mappings specified by indices data.

    The following indices data formats are supported for defining a
    scattered matrix multiplication operation (:attr:`indices_data[0]`
    holds the name of the indices data format as specified below):

    - ``"scatter_mm"`` - matrix multiplications scattered in batches
      of tensors.

      If :attr:`blocks` is a :math:`(* 	imes M 	imes K) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor,
      and :attr:`indices = indices_data['indices']` is a :math:`(*
      	imes 3)` tensor, then the operation is equivalent to the
      following code::

        c_offsets, pq = indices_data[1:]
        for r in range(len(c_offsets) - 1):
            for g in range(c_offsets[r], c_offsets[r + 1]):
                p, q = pq[g]
                accumulators[r] += blocks[p] @ others[q]

    - ``"bsr_strided_mm"`` - matrix multiplications scattered in
      batches of tensors and a tensor.

      If :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor, then
      the operation is equivalent to the following code::

        c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for i, r in enumerate(r_offsets):
                r0, r1 = divmod(r, N)
                acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
                for g in range(c_indices[i], c_indices[i+1]):
                    p = p_offsets[g]
                    q0, q1 = divmod(q_offsets[g], N)
                    acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

    - ``"bsr_strided_mm_compressed"`` - matrix multiplications
      scattered in batches of tensors and a tensor. A memory and
      processor efficient version of ``"bsr_strided_mm"`` format.  If
      :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor, :attr:`others`
      is a :math:`(* 	imes K 	imes N)` tensor, :attr:`accumulators`
      is a :math:`(* 	imes M 	imes N)` tensor, then the operation is
      equivalent to the following code::

        c_indices, r_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for r in r_offsets:
                m = (r // N) // Ms
                n = (r % N) // Ns
                r0, r1 = divmod(r, N)
                c0, c1 = c_indices[m], c_indices[m + 1]
                acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
                for i, p in enumerate(range(c0, c1)):
                    q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i]
                    q0, q1 = divmod(q, N)
                    acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

      Notice that the order of ``r_offsets`` items can be arbitrary;
      this property enables defining swizzle operators via
      rearrangements of ``r_offsets`` items..

    Auxilary functions are provided for pre-computing
    :attr:`indices_data`. For example,
    :func:`bsr_scatter_mm_indices_data` is used to define indices data
    for matrix multiplication of BSR and strided tensors.

    Parameters
    ----------
    blocks (Tensor): a 3-D tensor of first matrices to be multiplied

    others (Tensor): a tensor of second matrices to be multiplied. If
      ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch
      tensor of second input matrices to be multiplied. Otherwise, the
      second input matrices are slices of the :attr:`others` tensor.
    indices_data (tuple): a format data that defines the inputs and
      outputs of scattered matrix multiplications.

    Keyword arguments
    -----------------

    accumulators (Tensor, optional): a tensor of matrix product
      accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor
      is a 1-D batch tensor of output matrices. Otherwise, output
      matrices are slices of the :attr:`accumulators` tensor.
    r   ra   
scatter_mmr   Nr,   r   r:   bsr_strided_mmSPLIT_Nr!   bsr_strided_mm_compressed)r   )r   r#   r   zerosr,   r   _scatter_mm2rc   r   rz   item_scatter_mm6zero_divmodr   r?   	enumerateemptyNotImplementedError)0blocksothersindices_datar   indices_formatPMsKs	c_offsetspqQKs_NsRMs_Ns_rg0g1ru   pqothers_shapeBKN	c_indices	r_offsets	p_offsets	q_offsetsmetar   MN_accumulators_shaper;   r_r0r1accq0q1jr&   r)   c0c1is0                                                   r   r   r      s7   H "!_N;;!IAr2%$QR(	2{{a\\
3Syy"Q&A ;;2r{&,,v}}]L&,,KAsC"99"997b2gbL,@9??1-12q\q1u%r2Aa5DAq Ovay6!9'<<O ' 3  BE	+	+||6",,1a2v{{;G;K8	9iDy/immo**,q0Q66A ;;'Acr):'AA'Aq'A^d^k^klL &&rs+EAr7N7)// .'\7b2gbL,@ 1Xyq12A"1**,B"1**,B"1q5)..0B#B]FB&q"R"W*bbj'@AC"2r](|Yq\1!'!!4Bvay6!RRZBG2K+LLL + 3  Iy)UY[gh  !344	6	6||6",,1a2v{{0<QR0@-	9iy/immo**,q0Q66A ;;'Acr):'AA'Aq'A^d^k^klL &&rs+EAr7N7)// .'\7b2gbL,@1Xs9~.A#IaL$5$5$7;FBbAbA"1**,B"1q5)..0B&q"R"W*bbj'@AC )%B- 81%a"f!r/A&AA&EFKKM!'1Bvay6!RRZBG2K+LLL !9 /  EIYIYZIIy)UY[gh  !344 ".11r   c           
      \   ||||	|
|hd hk(  r.t         j                  j                         }t        d| ||||f|dt         j                  df      }| |j
                  d$i | |S | ||fdk(  rU||fdk(  rd}d}d}d	}d}
d	}	n||fd
k(  rd}d}d}d	}d}
d	}	n||fdk(  rd}d}d}d	}d}
d	}	n||fdk(  rd}d}d}d}d}
d	}	nx| ||fdk(  rU||fdk(  rd}d}d}d}d}
d}	n[||fd
k(  rd}d}d}d	}d}
d}	nF||fdk(  rd	}d}d}d	}d}
d	}	n1||fdk(  r)d}d}d}d	}d}
d	}	n| ||fdk(  rd||fdk(  rd	}d}d}d}d}
d}	n||fd
k(  rd}d}d}d}d}
d}	n||fdk(  rd}d}d}d	}d}
d}	n||fdk(  rd}d}d}d	}d}
d	}	n||fdk(  rd}d}d}d}d}
d	}	n| ||fdk(  rd||fdk(  rd	}d}d}d}d}
d}	n||fd
k(  rd	}d}d}d	}d}
d}	n||fdk(  rd	}d}d}d	}d}
d	}	nk||fdk(  rd}d}d}d	}d}
d	}	nW||fdk(  rPd	}d}d}d}d}
d	}	nC| ||fdk(  r;||fdk(  rd}d}d}d}d}
d}	n'||fd
k(  rd}d}d}d}d}
d}	n||fdk(  rd}d}d}d}d}
d	}	|(ddd	ddddddd	j                  |d      }|dk\  r|dk\  rd}||z  }|t        |dk  rdnd|      }|t        |dk  rdnd|      }|
xs d}
|	t        | |      dkD  rddddj                  |d	      }	n`t        | |      dk(  rddddj                  |d	      }	n:t        | |      dk(  rdd	dj                  |d	      }	ndddj                  |d	      }	|xs d	}||k  sJ t        ||             ||k  sJ t        ||             || k  sJ t        | |              ||k  sJ t        ||!             ||k  sJ t        ||"             t        d$||||
|	|d#|S )%Nr   r         ?version)   r   r   )r:   r:   r   r:      )    r   r
   r   )@   r   )   r   )   r   r      r   r   )   r   r   )r   r   )   r   r   )   r   r   r   )	r:   r   r   r   r   r   r   r       r   r   r   )r:   r   r   )r:   r   )TILE_Mr   )TILE_Nr   )r   r   )r   r   )r   r   )r   r   
GROUP_SIZE
num_stages	num_warpsr   r6   )	r   r   get_device_namer   float16updategetrB   dict)r   r   r   r   r   r   r   r   r   r   r   extradevice_namer   r   s                  r   scatter_mm_metar     s   J
CvMjj002q!QB&7!"EMM3 79DKK % K q!9
"Bx8#&2Fja:RS	bX%&2Fja:RS	bX%&2Fja:RS	bZ'&2Fja:RS	AY*$Bx8#&2Fja:RS	bX%&2Fja:RS	bX%&3F!zqJSTbZ'&2Fja:RS	AY+%Bx8#&3F!zqJSTbX%&2Fja:RS	bX%"6BV!zqJSTbZ'"6BV!zqJSTbZ'"6BV!zqJSTAY+%Bx8#&3F!zqJSTbX%&2Fja:RS	bX%&3F!zqJSTbZ'&2Fja:RS	bZ'&2Fja:RS	AY+%Bx8#&3F!zqJSTbX%&2Fja:RS	bX%&3F!zqJST aQQRarQS[]^bbcdfhi9dG	
gB~28RR0~28RR0qJq!9tA1-11"a8IAY$A1-11"a8IAY#A**2q1IA**2q1IqJR<3V33<R<3V33<7$D1$$77$D1$$77$D1$$7 Vvf%GVOTV Vr   c                    |||hd hk(  rYt         j                  j                         }	t        d| ||||f|	dt         j                  df      }
|
 |
j
                  di | |
S |xs d}|xs d}|xs d}t        d|||d|S )	Nbsr_dense_mmr   r   r   r   r   )GROUP_SIZE_ROWr   r   r6   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   s              r   bsr_dense_mm_metar   .  s    :~.4&8jj002Aq"b(9;!"EMM3 79DKK % K#(qNqJQIc~*PYc]bccr   c                 8   |t         j                  }||	|
|hd hk(  rWt         j                  j                         }t	        d| |||||dk(  |dk(  |dk(  f|d|df      }| |j
                  di | |S |xs d}|xs d}|
xs d}
|	xs d}	t        d|||
|	d|S )	Nbsr_dense_addmmr   r   r   r   r   )r   r   r   r   r6   )r   r   r   r   r   r   r   )r   r   r   r   r   betaalphar   r   r   r   r,   r   r   r   s                  r   bsr_dense_addmm_metar  =  s    }J7D6Ajj002)Aq!RTQYPQ	SX\]S]+^#a_>DKK % KlG#(qNqJQIt:ajtnsttr   c                   2    e Zd ZdZd Zd Zd Zed        Zy)TensorAsKeyaS  A light-weight wrapper of a tensor that enables storing tensors as
    keys with efficient memory reference based comparision as an
    approximation to data equality based keys.

    Motivation: the hash value of a torch tensor is tensor instance
    based that does not use data equality and makes the usage of
    tensors as keys less useful. For instance, the result of
    ``len({a.crow_indices(), a.crow_indices()})`` is `2`, although,
    the tensor results from `crow_indices` method call are equal, in
    fact, these share the same data storage.
    On the other hand, for efficient caching of tensors we want to
    avoid calling torch.equal that compares tensors item-wise.

    TensorAsKey offers a compromise in that it guarantees key equality
    of tensors that references data in the same storage in the same
    manner and without accessing underlying data. However, this
    approach does not always guarantee correctness. For instance, for
    a complex tensor ``x``, we have ``TensorAsKey(x) ==
    TensorAsKey(x.conj())`` while ``torch.equal(x, x.conj())`` would
    return False.
    c                 h   d }t        j                  |      | _        |j                  t        j
                  u r ||      | _        n|j                  t        j                  t        j                  hv r2 ||j                                ||j                               f| _        ns|j                  t        j                  t        j                  hv r2 ||j                                ||j                               f| _        nt        |j                        t!        | j                        | _        y )Nc                    | j                   j                  s| j                   j                  rJ | j                          | j                         | j	                         | j
                  | j                         | j                   fS r   )r,   is_floating_point
is_complexdata_ptrstorage_offsetr#   rC   )objs    r   get_tensor_keyz,TensorAsKey.__init__.<locals>.get_tensor_keyh  s]     		33syy7K7KWciiWLLLNC$6$6$8#))SZZ\SVS\S\]]r   )weakrefref_obj_refr   r   stridedkey
sparse_csrr   r   r   
sparse_csc
sparse_bscccol_indicesrow_indicesr   hash_hash)selfr  r  s      r   __init__zTensorAsKey.__init__f  s    	^  C(::&%c*DHZZE,,e.>.>??&s'7'7'9:N3??K\<]^DHZZE,,e.>.>??&s'7'7'9:N3??K\<]^DH%cjj11$((^
r   c                     | j                   S r   )r  r  s    r   __hash__zTensorAsKey.__hash__  s    zzr   c                     t        |t              sy| j                  |j                  | |u S | j                  |j                  k(  S )NF)
isinstancer  r  r  )r  others     r   __eq__zTensorAsKey.__eq__  sA    %-88uyy0 5= xx599$$r   c                 "    | j                         S )z'Return object if alive, otherwise None.)r  r  s    r   r  zTensorAsKey.obj  s     }}r   N)	__name__
__module____qualname____doc__r  r  r!  propertyr  r6   r   r   r  r  O  s*    ,$4%  r   r  )maxsizec	           	      B   |j                   }	|	J |	j                         |	j                         }}
|
j                  }t        j
                  }| dk(  r*||z  }g }t	        j                  |||      |z  }t        ||z        D ]o  }|
|   j                         }|
|dz      j                         }||k(  r2|j                  ||| ||z  z  j                  |      |j                  ||z
        z          q t	        j                  |      }|
j                         }|j                         }|||z  z  }||z   j                  d      }|
}||   j                  |      }|j!                  dd      \  }}||   }| |||fS | dk(  r||z  }g }g }t	        j                  |||      |z  }t        ||z        D ]  }|
|   j                         }|
|dz      j                         }||k(  r2|j                  t	        j                  ||||      j                  |             |j                  ||| ||z  z  j                  |      |j                  ||z
        z           t	        j                  |      }|
j                         }|j                         }|||z  z  }||z   j                  d      }t	        j                  |
d d t	        j"                  ||   j                  |      d      f      }t	        j                  |      }| ||||fS | d	k(  r|}dg}g }t        |      D ]  }t        ||z        D ]  }|
|   j                         }|
|dz      j                         }t        ||z        D ]l  }|j                  |d   |z   |z
         t        ||z
        D ]?  } || z   }!||!   j                         |||z  z  z   ||z  z  |z   }"|j                  |!|"g       A n   | t	        j$                  |||      t	        j$                  |||      fS t'        d
| d      )Nr   r   r   ro   T)
descendingstabler   r   r   zInvalid indices_format=z>. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm)r  r   r   r   r   int32arangerc   r   appendrepeatrepeat_interleavecatdiffnonzeror   sortcumsumr   r   )#r   r   r   r   r   r   nbatchesr   compressed_sparse_tensor_as_keyr   r   r   r   indices_dtyper   q_offsets_lstr;   r&   r   r   r   crow_indices_diffnon_zero_row_indicesar   r   nnz_per_rowindicesp_offsets_lstr   
pq_offsetsr)   r   r   r   s#                                      r   _bsr_scatter_mm_indices_datarA    sZ   
)
-
-C?? # 0 0 2COO4E+L  FKKM44'\LLfEJqBwAa%%'Ba!e$))+BRx  +b"4Q"?!G!G!PSTSfSfgilngnSo!op   IIm,	(--/088: BF+ULL$	 	'(<=OOPWX*//4/MWg&		9i@@	+	+'\LLfEJqBwAa%%'Ba!e$))+BRx  b"MRX!Y!`!`ah!ij  +b"4Q"?!G!G!PSTSfSfgilngnSo!op   IIm,	(--/088: BF+ULL$	II|BQ/$||,=>R,S,e,efm,npqrt u	IIm,		9iKK	<	'C	
xA17^!!_))+!!a%(--/qBwA$$Yr]R%7"%<="27^F(^002Q!r']BqBwORSS"))1a&1 , ( $ ! YmFKZ}VLN 	N
 3N#44rsttr   c                    | j                         dk(  sJ | j                  dk(  sJ | j                         }| j                         }| j	                         j
                  dd }| j
                  \  }}|\  }	}
|j
                  dd \  }}||k(  sJ |j
                  dd j                         }t        ||||	|
fi |}d|vr<|j                  | j                  t        j                  t        j                  hv        |d   }t        |||||	|
||t        |       	      }|dk(  r|j                  d	
       ||fz   S |dk(  r|j                  d
       ||fz   S |S )zkComputes indices data for :func:`scatter_mm` used in BSR and
    strided tensor matrix multiplication.
    r   r
   r!   N
allow_tf32rC  r   r   T)is_compressedr   F)	dense_dimr   r   r   r   r#   numelr   r   r,   r   r   r.   rA  r  )r   r   r   
meta_inputr   r   r=   r   r   r   r   K_r   r6  r   r   r   s                    r   bsr_scatter_mm_indices_datarJ    sg    ==?a88q==##%L//#K

""23'I99DAqFBKKEB7N7{{3B%%'H1aB9j9D:%syyU]]ENN,KKL9oG/1aR7K<LNL 44$'tg%%	+	+%(tg%%r   c           
      0   | j                   dk(  sJ |j                   dk\  sJ | j                  d   | j                  d   |j                  d   }}}| j                         j                  dd }|t        | |d      }|d   }|@t	        j
                  g |j                  dd ||| j                  | j                        }|j                  }	t        |      }| j                         dk(  r|j                          n|d	v r/|j                          t        | j                         |||
       nT|dk(  rC|j                  dd j                         }
t	        j                  |
|z  |d   z  |z  |d   z  |d   |d   f| j                  | j                        }t        |      j                  dd      j                  |
||d   z  |d   ||d   z  |d         j!                  dd      j#                  dd      }t        | j                         |||
       |j%                  |j'                  d|
||d   z  ||d   z  f      j!                  dd      j)                  |
||      j                  dd             nt+        |      |j                  |	      S )zBSR @ strided -> strided
    r
   r!   ro   Nr   )r   r   r   >   r   r   r   r   r   )ra   r   r   r
   )r   r
   ra   r   )r   r#   r   rJ  r   r   r,   r   r   _nnzr   r   rG  r   r   r   movedimr   copy_	unflattenreshaper   )r   r   r   outr   r   r   r=   r   	out_shaper6  r   r   s                r   bsr_scatter_mmrS    sb    88q==::??2		"u{{2BB

""23'I23Nij!!_N
{kk5EKK,5b5"5SYYszzZ		I
C.C
xxzQ			J	J		3::<3G	<	';;s#))+{{HrMYq\$AB$F)TU,$VXabcXdfopqfr#s),3::GE"9R$4"	!"4ilB)TU,DVXabcXde7<671a=	 	 	3::<LQ		,9Q21+=rYq\?Q RS7<678R,9R$		& ".1188Ir   F)r   r  rQ  skip_checksmax_gridr   inputr   denserQ  rT  rU  r   c                    d}	|j                         }
|j                         }|j                         }|j                         dz
  }|j                  ||dz    \  }}|
j                  |dz   |dz    }|j                  d   }t        |	||      }||j                  |||fz         }|j                         dk(  sdk(  r@dk(  r|j                          |S |j                  |        dk7  r|j                         |S #t        ||||d   |d   |j                        |}t        || ||      \  }}}
} }}|\  j                  d|z        }||z  t        |f      }t        | f      } |}t        |f      }t         j"                  t$        j&                  t         j(                  t$        j&                  t         j&                  t$        j*                  t         j*                  t$        j*                  i|j                      |j-                  d      }|j-                  d      dz
  }|j-                  d	      }|||f}|*t/        |d d d d d         d
dt1        |d d       z
  z  z   }nd }|
d|d|d| d|d|di}dk7  sJ  fd}t3        ||||       |j5                         |j5                         k7  r*|j                  |j7                  |j                               |S )Nr   r   r
   ra   ro   r   r,   r   r   r   r   NNr   Nro   r   r   r   r   Nc                     t        |    g t        | dk(  dk7  dk(  t        j                  k(  d y )Nr   r   )beta_is_onebeta_is_nonzeroalpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COLrC  	acc_dtype)_bsr_strided_addmm_kernelr_   tlfloat32)	rt   r   BKBMBNr  r   dot_out_dtyper   s	     r   r}   zbsr_dense_addmm.<locals>.kernel{  sk    !$' 	!>2			 	 AI!$

2#	 	r   )r   r   r   r"   r#   rQ   	new_emptyrL  r   rN  mul_r  r,   r   r   r   r   r   rh  ri  r.   float64r   r0   r?   r   r	  r   )!rV  r   rW  r   r  rQ  rT  rU  r   r   r   r   r   
batch_ndimr   r   r=   r   original_batch_dims_broadcasted
out_backupr   out_untiled	n_batchesn_block_rowsn_block_colsrf   rg   rm   r}   rj  rk  rl  rm  s!      ``   `                    @@@@r   r   r   -  s    FZZ\F##%L//#K!!#a'J99Z
Q/DAqZ!^JN;IBA ';63&N#
{oo=AFG
xxzQ%1*19IIK
 
 IIeqy
|#Aq!Yq\9Q<u\_\e\efJ;I#uV[]`;a8L+vueSFBhhy!r'*G	
gBeb"X.Eeb"X.EK
C"b
*C]]BJJ^^RZZ]]BJJ]]BJJ0 14		;M
 

1I$$R(1,L::b>LL,7IHRaL2./'QXbq\AR=R2SS 	m_{}[O A::  &/9kB
||~,,.. 	))**:*:;<r   IS_BETA_ZEROrc  re  TILE_Krf  rC  c            
         t        j                  d      } t        j                  d      }!||| z  z   ||!z  z   }"t        j                  |"      }#t        j                  |"|z         }$|$|#z
  }%|%dk(  ry t        j                  d|      }&t        j                  d|      }'||| z  z   |	|#z  z   |
|&d d d f   z  z   ||'d d d f   z  z   }(||| z  z   ||#z  z   })||| z  z   ||!z  z   ||&d d d f   z  z   }*||| z  z   ||'d d d f   z  z   }+t        j                  d|      },t	        |%      D ]0  }-t        j
                  ||f|      }.t        j                  |)      }/t	        d||      D ]  }0|0|,z   }1|1|k  }2t        j                  |*||1d d d f   z  z   |2d d d f   d      }3t        j                  |+||/z  z   ||1d d d f   z  z   |2d d d f   d      }4|.t        j                  |3|4||      z  }. |r|.| z  }.n| |.z  |t        j                  |(      z  z   }.t        j                  |(|.j                  |j                  j                               |(|	z  }(|)|z  })3 y )Nr   axisr   rY          maskr   rC  	out_dtype)rh  
program_idloadr-  rc   r   dotstoretor,   
element_ty)5r  r   rx  rc  re  kry  
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stridemat1_ptrmat1_batch_stridemat1_tiled_row_stridemat1_tiled_col_stridemat1_row_block_stridemat1_col_block_stridemat2_ptrmat2_batch_stridemat2_tiled_row_stridemat2_tiled_col_stridemat2_row_block_stridemat2_col_block_striderf  rC  	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrmat1_block_ptrsmat2_block_ptrsk_tile_arange_	acc_block	col_blockk_tile	k_offsetsmask_k
mat1_block
mat2_blocks5                                                        r   _sampled_addmm_kernelr    s   F MMq)	1- ')34!M12 	 
 WW45
''"9<O"OP "J.a<99Q699Q6 !I-.*,- &(8D(AAB &(8q(AA	B 	 &23 :-. 	 )+,#m34 $&6q$w&??@ 	 )+,#&6tQw&??@ 	 		!V,wA-!?yQI  12I1f-"]2	"QWW#+ia.@@Aa
  WW#+i78+i4.@@A  4	
 RVVJ
z]fgg	# .& U"	!I-rww?P7Q0QQ	 HH&	Z5E5E5P5P(QR !22!33G  r   r   c                    t        j                  d      }t        j                  d      }t        j                  d      }t        j                  d      }t        j                  d      } t        j                  |||| |      \  }}|||z  z   ||z  z   }!t        j                  |!      }"t        j                  |!|z         }#|#|"z
  }$|$dk(  ry t        j
                  d|      }%t        j
                  d|      }&| ||z  z   ||"z  z   ||%d d d f   z  z   ||&d d d f   z  z   }'|||z  z   ||z  z   ||&d d d f   z  z   ||%d d d f   z  z   }(|||z  z   ||z  z   ||z  z   ||%d d d f   z  z   ||%d d d f   z  z   })||	|z  z   |
|"z  z   }*t        j                  ||f|      }+t        |$      D ]m  },t        j                  |'      }-t        j                  |*      }.t        j                  |(||.z  z         }/|+t        j                  |-|/||      z  }+|'|z  }'|*|
z  }*o t        j                  |)|+j                  |j                  j                               y Nr
   r{  r   r   rY  r  )rh  r  num_programs	swizzle2dr  r-  r   rc   r  r  r  r,   r  )0r  r  r  r  r  r  r  r  r  r  r  	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_striderc  re  rf  rC  r   r  r  col_block_pidrv  rw  r  r  r  r  r  r  r  dense_block_ptrsoutput_ptrsr  output_acc_blockr  values_blockdense_row_idxdense_blocks0                                                   r   "_bsr_strided_dense_rowspace_kernelr    s   \ MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.a<99Q699Q6 !I-.*,- &(8D(AAB &(8q(AA	B 	  9,-$}45 %'74'@@A %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 88]M$B)TwA77#45L GG$56M''"25Km5["[\K |[Zcl mm !22!33    	.11*2B2B2M2MNOr   c           
          |j                  d      }|j                  d      dz
  }	|j                  d      }
||
|	f}|*t        |d d d d d         ddt        |d d       z
  z  z   }nd }|d|d|d|d	|d
i}|j                  t        j
                  t        j                  fv rt        j                  dnt        j                  d fd}t        ||||       y )Nr   ro   r   r   ra   r   rZ  r[  r^  r\  TFc                 >    t        |    g t        | d y )N)rf  rC  )r  r_   )rt   r   rf  rC  r=   r   s     r   r}   z*_run_dense_rowspace_kernel.<locals>.kernel  s<    .t4 %~6 $%	
 r   r   r0   r?   r,   r   r-   r.   rh  ri  rp  r   )r=   r   r   r   rW  outputrU  r   ru  rv  rw  rf   rg   rm   r}   rf  rC  s   `      `       @@r   _run_dense_rowspace_kernelr    s     JJqM	#((,q0zz"~l;	!TrT 23gSRTSTEVAV6WWKKO-=K
 <<EJJ77

IJ

IJ	 	foy+Fr   c           
          |j                  d      }|j                  d      dz
  }||f}|*t        |d d d d d         ddt        |d d       z
  z  z   }nd }|d|d|d|	d|
di}|j                  t        j
                  t        j                  fv rt        j                  d	nt        j                  d
 fd}t        ||||       y )Nr   ro   r   r
   r   )r   N)r   ro   )r   r]  TFc                 L    t        |    g	t        | ddd y )Nr   r   )rf  rC  r   r   )r  r_   )
rt   r   rf  rC  r  r   r=   is_beta_zeror  tile_ks
     r   r}   z)_run_sampled_addmm_kernel.<locals>.kernel  sQ    !$'t\% &~6 $%r   r  )r  r   r  r=   r  r  r   r   r   mat1mat2rU  ru  rv  rf   rg   rm   r}   rf  rC  s   ``````            @@r   _run_sampled_addmm_kernelr    s     KKN	#((,q0-	!TrT 23gSRTSTEVAV6WWKKI'')
 <<EJJ77

IJ

IJ		 		 	foy+Fr   g      ?)r   r  rQ  rT  rU  r  r  c                   d}t        ||        t        || ||      }	|st        ||| j                         t        ||| j                         |dk7  r.| j                  t
        j                  u rt        d| d| d       | j                  t
        j                  ur/t        ||| j                         t        ||| j                         nt        |||j                         t        |||       |t        ||       t        |||j                         t        ||| j                         t        |j                  |	j                  k(  xr! |j                         | j                         k(  | d|	j                   d|	j                          d|j                   d	|j                          	       ||	j                  |j                  d
      }n|j                  |	       |j                         dk(  s|j                         dk(  r|S |j                         j                  dd  }
|j!                  d      }|j!                  d      }|j!                  d      }|dk(  s|dk(  r!|j                         j#                  |       |S |}t%        |||      \  }}}}}t'        ||
d   |f      }t'        |||
d   f      }t)        |
 }t+        |||dk(  |
||||||||       |j                         j-                         dd  |j-                         dd  k7  rF|j                         j                  |j/                  |j                         j                               |S )Nsampled_addmmr}  Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r!   ro   r   r   )r   r   r   r   r,   r   boolr   r2   r*   r#   rL  r  rN  rG  r   r   ro  r   r   rz   r  rC   rP  )rV  r  r  r   r  rQ  rT  rU  r   input_broadcastedr=   r&   r)   r  rs  r   r   r   r  s                      r   r  r    s
    !'4VUD$Ou||4u||4s{u{{ejj8h1$7jk {{%**,FD%++6FD%++6FD$**5&vtT: -VS$++6FC5II!2!8!88 3
ejjl2h?@Q@W@W?X Y-->-C-C-E,F G++.99+_SXXZLR ;#&&tzz&=CII'(99;!sxxzQJJJL&&rs+	IIbMIIbMIIbM C<16JJLd#J 
8FsDRV8W5k64 	!a'89 9Q<'89i!4q&L+$	
 %%',0DD%%fnnZ5F5F5H5N5N&OPr   T)rQ  rT  rU  r   enable_bsr_scatter_mmr  c          
      4   d}| j                   dd  \  }}	|st        ||        t        || |j                         t	        || |j
                         t        || |       |j                  d      }
| j                         j                   dd  \  }}t        |
|z   d|
 d| d       t        |||f       n|j                   dd  \  }}
t        || |      }|o|sm|||
fz   }t        |j                   |k(  d| d|j                    d       t        |j                         xs  |j                  dd      j                         d	       ||j                  |||
fz         }| j                         d
k(  r|j!                         S | j                         j                   dd  }|rt#        |      dk(  r| j%                         d
k(  r| j&                  dk(  r| j
                  }|t(        j*                  t(        j,                  hv r|dk\  r|
dk\  s;|dk(  r|
dk\  s1|
dk\  s,|t(        j.                  k(  r'|dk\  s|dk(  r|
dk\  s
|dk(  r|
dk\  rt1        | ||      S |t3        ||	|
|d
   |d         }nt3        ||	|
|d
   |d   fi |}|}t5        | ||      \  }}}}}t7        ||d d d         }|}t7        ||d
   |d
   f      }t9        ||||||||       |j;                         |j;                         k7  r*|j=                  |j?                  |j                                |S )Nr   r!   ro   z"bsr_dense_mm(): dense.size(-1) == z( should be divisible by blocksize[0] == r    z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   r:   r
   r   r   r   i      r   r   r   )rQ  r   ) r#   r   r   r   r2   r,   r*   r   r   r   r@   rQ   is_contiguousr   rn  rL  r   rz   rF  r   r   r   r.   ri  rS  r   r   r   r  r	  rN  r   )r   rW  rQ  rT  rU  r   r  r   r&   r'   r)   	row_blockr  r(   rr  expected_out_shaper=   r,   rs  r   r   r   rt  s                          r   r   r   7  sE     		"#2VS)ell3U[[1&vsE:

2A#&::<#5#5bc#: Iy	M!4QC 8##,+Q0
 FY	$:;KK$EB*>vsE*R'?;!@Aq6!I		//./z#))AG
 !!#Ls}}R'<'J'J'L" ;//"AQF"JKC 88:?99;JJL&&rs+	 S^r%9cmmoQR>RWZW_W_cdWdIIE u}}enn==9d4iAJ6kemm+t)3h183h19%c5c::<$QAy|Yq\JD$QAy|Yq\RTRD 
 9GsESV8W5k65# "%4R49 ilIaL%AB 	#9flKQVX[]egkl<<>Z0022 [--j.>.>?@r   MAX_ROW_NNZTILEc                    t        j                  d      }t        j                  d      }t        j                  d      }| ||z  z   ||z  z   }t        j                  |      }t        j                  ||z         }||z
  }|dk(  ry t        j                  d|
      }|||z  k  }|||z  z   ||z  z   ||z  z   }t        j                  ||z   |t	        d             j                  t         j                        }t        j                  |d      }t        |
|	|
      D ]  }||
z  }|||z  k  }t        j                  ||z   |t	        d             j                  t         j                        }t        j                  |d      }t        j                  ||kD  ||      } t        j                  ||z
        }t        j                  |d      }t        |
|	|
      D ]  }||
z  }|||z  k  }t        j                  ||z   |t	        d             j                  t         j                        }t        j                  ||z
        }|t        j                  |d      z  } t        j                  ||z   ||z  j                  |j                  j                        |       t        |
|	|
      D ]  }||
z  }|||z  k  }t        j                  ||z   |t	        d             j                  t         j                        }t        j                  ||z
        }t        j                  ||z   ||z  j                  |j                  j                        |        y )Nr
   r{  r   r   infr~  )r  )rh  r  r  r-  r/   r  ri  rz   rc   whereexpsumr  r,   r  )r  r  r  r  r  r  values_nnz_col_block_strider  r  r  r  r  row_block_offset_pidr  r  r  r  r  
row_aranger  curr_row_values_ptrsrow_tilemax_row_valuer  curr_max_row_valuenumdenoms                              r   _bsr_softmax_kernelr    s7    MMq)	!}}!41- ')34!M12 	 
 WW45
''"9<O"OP "J.a<YYq$'
Gi// !I-.%(<<= 9$% 	 77/*<4PUV[P\}]``acakaklxa0t[$/A$J) 33Dww3j@tTYZ_T`S`addegeoeopH!#q!9HH]5G%GXjkM 0 ffX-.s#t[$/A$J) 33Dww3j@tTYZ_T`S`addegeoeopH&&M12CRVVCa((E 0 	%
2S5[4D4DZEUEUE`E`4ahlmt[$/A$J) 33Dww3j@tTYZ_T`S`addegeoeopH&&M12CHH)J6u8H8HIYIYIdId8elpq 0r   c                 *   d}t        ||        t        || | j                         | j                         dk(  s| j	                         dk(  r| j                         S | j                  dd  \  }}| j                         }| j                         j                  dd  \  t        j                  |      nt        j                        | j                         j                  d      j                  dd      }| j                         j                  dd      j                         r| j                         j                         }n| j                         }|j                  dd      j                         j                  d      j                  dd      j!                  d|z        }|j                  d   |z  f}d }	|dd df   d|d	i}
fd
}t#        ||
||	        |j!                  d|      j                  dd      j                   | j                         j                   }t%        j&                  | j                         j                         | j)                         j                         || j                  | j*                        S )Nbsr_softmaxr   r!   r   r]  ro   .r[  rZ  c                 R    t        |    g t        | t        d        y )Nr  )r  r_   rB   )rt   r   r  max_row_nnzr  s     r   r}   zbsr_softmax.<locals>.kernel  sH    % %~6$ 
 G[)r   r   )r   r2   r,   rL  rG  cloner#   r   tritonnext_power_of_2r   r   r   r   r  rD   rP  r   r   r   r   r   )rV  r  r   r&   r)   nnzr   r   rf   rg   rm   r}   r  r  s    `          @@r   r  r    s:   'FE5;;/::<1 2;;= {{231jjl$||~33BC8	9 003K 00=K))+55a8@@BG <<>##B+99;\\^))+F\\^F!!"b)446@@CKKArRZZ[]_hjmpyjyz\\!_ii@	 crc"MO	
	 	foy+FXIsI>HHRPXXZ_ZfZfZhZnZno-- &&(%%'<<
 	
r   queryr  value	attn_mask	dropout_p	is_causalscalec           	          d}t        | | d       t        |d u| d       |J t        |j                  t        j                  k(  | dt        j                   d|j                   d       t	        ||| j
                         t	        ||| j
                         t	        ||| j
                         t        ||| j                         t        ||| j                         |j                  t        j                  urt        ||| j                         t        || |j                  dd      d	d
      }|| j                  d      dk(  s|d	k(  rt        d
| d| d       |'dt        j                  | j                  d            z  n|}	|j                         j                  |	       t!        |      }t        j"                  j$                  j'                  |j                         |d       t)        ||      }|S )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.z(): attn_mask.layout must be z, but got attn_mask.layout == r    r!   ro   r}  F)r   rT  r   z(): current value of scale == z results in division by zero.r   T)r   inplace)r   r   r   r   r   r   r2   r,   r  r  r   r   mathsqrtr   ro  r  nn
functionaldropoutr   )
r  r  r  r   r  r  r  r   sdpascale_factors
             r   r  r  %  s    1Mh=>	
 	T!h=>	
 $$$ 0 00h ((-(8(8'9 :##,#3#3"4A7	
 	VS%,,/VUELL1VY5FC-FE5;;/??%**,	5;;7Ys}}R/D3\ab=UZZ^q0ESL(8 @/ /
 9>q499UZZ^445<(4 ##DKKMY#MD%(r   r   r   r   rm  r   r   c                 .   | |z  }||z  }t        j                  d      }t        j                  d      }||z  }||z  }||z  t        j                  d|      z   }||z  t        j                  d|      z   }t        j                  d|      } ||d d d f   |z  | d d d f   |z  z   z   }!|| d d d f   |	z  |d d d f   |
z  z   z   }"t        j                  |||z  z         }#t        j                  ||dz   |z  z         }$|#|$k(  ry t        j                  ||f|      }%t        |#|$      D ]  }&t        j                  ||&|z  z         }'t        j                  ||&|z  z   |z         }(t        j                  |!|'|z  z         })t        j                  |"|(|z  z         }*|%t        j                  |)|*||      z  }% |||z  z   |d d d f   |z  |d d d f   |z  z   z   }+t        j                  |+|%j                  |j                  j                               y Nr   r{  r   rY  )r  rC  )rh  r  r-  r  r   rc   r  r  r  r,   r  ),r   r   r   
blocks_ptrblocks_stride_Pblocks_stride_Mblocks_stride_K
others_ptrothers_stride_Qothers_stride_Kothers_stride_Naccumulators_ptraccumulators_stride_Raccumulators_stride_Maccumulators_stride_Npq_offsets_ptrpq_offsets_stridepq_ptrpq_stride_Tpq_stride_1rm  r   r   rC  r   r   pid_tpidpid_mpid_nrmrnrkA_ptrB_ptrr   r   r  r   r   r   Ar   C_ptrs,                                               r   _scatter_mm2_kernelr,  W  s    &[&[1%mm#r	bfnryyF33fnryyF33YYq!_bDkO;bqkO>[[\bDkO;bqkO>[[\WW^e.?&??@WW^uqy4E&EEF8HHff-]C	r2A[01A[0;>?AO 334AO 334A1*UUI  !5+@#@@q$wK//"T1W+@U2UUW
	%5%;%;%F%FGHr   r   r   r@  
pq_indicesr   c                    | j                   \  }}|j                   \  }}|j                   \  }	}}t        t        ddz        t        ddz        dd      }
fd}t        j                  t
        j                  t        j                  t
        j                  t        j                  t
        j                  t        j                  t
        j                  i|j                     }d|
vr#|
j                  |t
        j                  k(         t        |   || | j                  d	      | j                  d      | j                  d      ||j                  d	      |j                  d      |j                  d      ||j                  d	      |j                  d      |j                  d      j                  d	      ||j                  d	      |j                  d      fd
|i|
 y )Nr:   r   r   r
   )r   r   r   r   c                     j                   d   dz
  t        j                  | d         t        j                  | d         z  dfS )Nr   r   r   r   r#   r  cdiv)METAr   r   r@  s    r   rt   z_scatter_mm2.<locals>.grid  sH    $$Q'!+V[[DN-KfkkZ[]abj]kNl-lnoppr   rC  rD  r   rm  )r#   r   rz   r   r   rh  ri  r.   rp  r,   r   r,  rC   )r   r   r@  r-  r   r   r   r   r  r   r   rt   rm  r   r   s     `          @@r   r   r     s    ,,1a,,1a$$1a3r16?3r16?q\]^	q 





4 5A4F4FH t#KK=BJJ#>K?D!q!FMM!$fmmA&6a8HFMM!$fmmA&6a8H,--a0,2E2Ea2H,J]J]^_J`
))!,
))!,j.?.?.B		
 (		
 		
r   r   rE  r   r   c                    ||z  }||z  }||z  }t        j                  d      }t        j                  d      }|| z  } || z  }!||z  }"||"z  }#|#|z  }$t        ||$z
  |      }%|$||%z  z   }&||"z  |%z  }'|&|z  t        j                  d|      z   }(|'|z  t        j                  d|      z   })t        j                  d|      }*||(d d d f   |z  |*d d d f   |z  z   z   }+|| |	z  z   |*d d d f   |
z  |)d d d f   |z  z   z   },t        j                  ||!z         }-|rW|-|z  |z  }.|-|z  |z  }/t        j                  ||.z         }0t        j                  ||.z   dz         }1|/|1z  ||/z
  |0z  z   }2|1|0z
  }3n8t        j                  ||!z         }2t        j                  ||!z   dz         }4|4|2z
  }3||2z   }5t        j
                  ||f|      }6|r|+0|z  z  }+t        |3      D ]j  }7t        j                  |5      }8t        j                  |,|8z         }9t        j                  |+      }:|6t        j                  |:|9||      z  }6|+|z  }+|5dz  }5l n||2z   };t        |3      D ]  }7t        j                  |5      }8t        j                  |,|8z         }9t        j                  |;      }<t        j                  |+|<|z  z         }:|;dz  };|5dz  }5|6t        j                  |:|9||      z  }6 ||-z   | |z  z   |(d d d f   |z  |)d d d f   |z  z   z   }=t        j                  |=|6j                  |j                  j                               y r  )rh  r  rB   r-  r  r   rc   r  r  r  r,   r  )>r6  r   r   r   r  r  r  r  r  others_stride_Br  r  r  accumulators_stride_Br  r  c_indices_ptrr_offsets_ptrp_offsets_ptrq_offsets_ptrrE  rm  r   r   r   r   rC  r   BLOCKS_MBLOCKS_Npid_t_r"  pid_br!  num_pid_in_groupgroup_idfirst_pid_mgroup_size_mr#  r$  r%  r&  r'  r(  r)  r   r&   r)   r   r   r   r  r   q_ptrr  r  r   r   r*  p_ptrr   r+  s>                                                                 r   _scatter_mm6_kernelrD    sy    '\<<A&mm#!("%0**+8k1:>s\12''L8fnryyF33fnryyF33YYq"bDkO;bqkO>[[\U_441d7o8UXZ[_ab[bXcfuXu8uv GGME)*aBAQ2A*+B*Q./BR7Q;",,Br'C./B.23Br'C"HHff-]C	R/))E3ZGGENGGEAI&GGENRVVAqMjYY	(
   "B&E3ZGGENGGEAI&GGENGGEA$778

RVVAqMjYY	   !1$u/D'DDq$wK//"T1W+@U2UUW
	%5%;%;%F%FGHr   r   r   r   r   force_contiguousc	                    |d   }	| j                   \  }
}|j                   \  }}|j                   \  }}}||k(  sJ ||	z  |k(  sJ fd}t        j                  t        j                  t        j
                  t        j                  t        j                  t        j                  t        j                  t        j                  i|j                     }d|vr#|j                  |t        j                  k(         |j                  d      dk(  sJ j                  d      dk(  sJ |j                  d      dk(  sJ |j                  d      dk(  sJ |rD| j                         } |j                         }|j                         s|j                         }n|}n|}t        |   ||| | j                  d      | j                  d      | j                  d      ||j                  d      |j                  d      |j                  d      ||j                  d      |j                  d      |j                  d      |||fd|i| |r#|j                         s|j                  |       y y y )	Nr   c                     j                   d   z  t        j                  | d         t        j                  | d         z  fS )Nr   r   r   r0  )r2  r   r   r   r   s    r   rt   z_scatter_mm6.<locals>.grid  sC    OOA&*FKKDN,KfkkZ\^bck^lNm,mnnr   rC  rD  r   r   r
   rm  )r#   r   r   rh  ri  r.   rp  r,   r   rC   rD   r  rD  rN  )r   r   r   r   r   r   r   r   rE  r   r   r   rI  r   B_r   r   rt   rm  accumulators_r   r   r   s      `                @@@r   r   r     sK    y/LL	2r<<2q &&	ArQww'\Qww	o 





4 5A4F4FH t#KK=BJJ#>K?"a'''"a'''"a'''"a''' &&(F&&(F--/ , 7 7 9 ,(MD!r2qFMM!$fmmA&6a8HFMM!$fmmA&6a8H=//2M4H4H4K]MaMabcMd	
 (	
 	
 L$>$>$@}- %Ar   r`  ra  rb  rd  c)                    t        j                  d      })t        j                  d      }*t        j                  d      }+t        j                  d      },t        j                  d      }-t        j                  |*|+|,|-|'      \  }*}+|||)z  z   ||*z  z   }.t        j                  |.      }/t        j                  |.|z         }0|0|/z
  }1t        j
                  d|"      }2t        j
                  d|$      }3t        j
                  d|#      }4|||)z  z   ||*z  z   ||+z  z   ||2d d d f   z  z   ||4d d d f   z  z   }5| ||)z  z   ||/z  z   ||2d d d f   z  z   ||3d d d f   z  z   }6|||)z  z   ||+z  z   ||3d d d f   z  z   ||4d d d f   z  z   }7|||)z  z   ||*z  z   ||+z  z   ||2d d d f   z  z   ||4d d d f   z  z   }8||	|)z  z   |
|/z  z   }9| r3t        j                  |5      j                  |%      }:|r|!s$||z  };|:|;z  }:nt        j                  |"|#f|%      }:t        |1      D ]m  }<t        j                  |6      }=t        j                  |9      }>t        j                  |7||>z  z         }?|:t        j                  |=|?|&|%      z  }:|6|z  }6|9|
z  }9o |!s|:|z  }:t        j                  |8|:j                  |j                  j                               y r  )rh  r  r  r  r  r-  r  r   rc   r  r  r,   r  )@r  r  r  r  r  r  r  r  r  r  r  	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_strider  r  r  r  r  r  r  r  r  r  r  r  r   r  r`  ra  rb  rc  re  rd  rf  rC  r   r   r  r  r  rv  rw  r  r  r  r  r  inner_block_aranger  
input_ptrsr  r  r  r  r  
beta_alphar  r  r  r  s@                                                                   r   rg  rg  C  s   r MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.99Q6YYq/:99Q6  9,-$}45 %}45 %'74'@@	A
 %'7a'@@A 	 !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!T''BBC %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 !wwz255i@L!E\
 J. !xx(FiXwA77#45L GG$56M''"25Km5["[\K |[Zcl mm !22!33   % 	.11*2B2B2M2MNOr   r   )NNNNNN)NNN)NNNNN)r   )NN)r}  FN)T)Br  osr   r  	functoolsr   torch.utils._tritonr   _triton_ops_metar   typingr   r   intgetenvr	   r   r   r   r*   r2   r@   rE   rQ   rV   r]   r_   rv   r   r   r   r   r   r   r   r   r  r  rA  rJ  rS  Tensorr  r   r   r  triton.languagelanguagerh  jit	constexprr  r  r  r  r  r   r  r  r/   r  r,  r   rD  r   rg  r6   r   r   <module>r`     sH    	    * & "-0;gij1k-l *
"(&U;.&,7Dk
/ >B S2n imaVJ GKd  dhu$@ @F =>Hu ?HuV@,h &*!QU#c||c\\c ||c ell#c c 5#x}!LMNc tncL < ZZx4 llx4 ||	x4
 ||x4 x4> <<?x4@ LLAx4 x4t ZZ}PN ||O}PP ||Q}PR <<S}PT LLU}PV W}P }P@"GJ(G` &*!QUQ||QllQ llQ ell#Q Q 5#x}!LMNQp '+!QU#&*h\\h||h ell#	h
 h 5#x}!LMNh tnh  $hV ZZCr \\Cr llCr CrL:
B !%0||0\\0 ||0 ELL)	0
 0 0 0d ZZ-I||-I "-I13-I <<-I LL-I LL-I -I -I^
LL
LL
 
 	

  ,,
B ZZOI llOI <<OI <<OI \\OI LLOI LLOI OI OI OIt &*G.LLG.LLG. ||G. ||	G.
 ||G. ||G. G.  ,,G. #G.R ZZ[PX \\Y[PZ [[P\ ll][P^ ||_[P` ||a[Pb c[Pd <<e[Pf LLg[Ph i[Pj k[P [P~ KLM$(!LL $r   