
    Ph                   B   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ ddl m!Z! d	d
l"m#Z#m$Z$m%Z% d	dl&m'Z'm(Z(m)Z) d	dl*m+Z+m,Z, d	dl$m-Z-m.Z.m/Z/ d	dl0m1Z1 d	dl%m2Z2m3Z3 d	dl4m5Z5 d	dl6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB d	dlCmDZDmEZE d	dlFmGZG ddlHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZR ddlSmTZTmUZUmVZV  ej                  eX      ZYej                  j                  eXd      Z\ej                  j                  eXd      Z]ej                  j                  eXd      Z^ G d deP      Z_ e_       j                  Za eP       j                  Zbd Zcd Zdd Ze G d d eI      Zf G d! d"eO      Zgej                   G d# d$             Zi G d% d&ei      Zj G d' d(ei      Zk G d) d*eN      Zl G d+ d,e2      Zmej                   G d- d.             Zn G d/ d0      Zo G d1 d2      Zp G d3 d4eq      Zry)5    )annotationsN)	AnyCounterDictIterableListOptionalSetTupleUnion)is_integer_dtype)FloorDivModularIndexing)ValueRanges   )counters   )configir	scheduler)	code_hashget_pathPyCodeCache)	MemoryDepStarDep)IRNodeReductionHintTritonTemplateBuffer)!indexing_dtype_strength_reduction)BaseScheduling	WhyNoFuse)AutotuneHint)do_benchget_fused_kernel_nameget_kernel_metadata
green_textis_welford_reductionnext_power_of_2Placeholdersympy_product
sympy_subssympy_symboluniqueyellow_text)opsV)"get_kernel_category_by_source_code   )
CSEVariableDeferredLinefree_symbol_startswithIndentedBufferindex_prevent_reorderingKernelOpOverridesPythonPrinterSizeArg	TensorArg)	config_ofsignature_ofsignature_to_meta
perf_hintsschedulefusionc                  0    e Zd Zd Zd Zd Zd Zd Zd Zy)TritonPrinterc                    t        |j                        dk(  sJ d| j                  | j                  |j                  d                dS )Nr2   tl.math.floor(r   ))lenargsparen_printselfexprs     iC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/codegen/triton.py_print_floorzTritonPrinter._print_floorE   sB    499~"""

4;;tyy|+D EFaHH    c                J    d| j                  | j                  |             dS )Ntl.math.sqrt(z.to(tl.float32)))rJ   rK   rL   s     rO   _helper_sqrtzTritonPrinter._helper_sqrtI   s%    tzz$++d*;<==MNNrQ   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   r2   r   	tl.where(, rG   )doprintrI   )rM   rN   cpqs        rO   _print_WherezTritonPrinter._print_WhereL   s_    LL1&LL1&LL1&1#Rs"QCq))rQ   c                   t        |j                        }t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }| j                  t        j                  |j                  d |        }| j                  t        j                  |j                  |d         }d| d| dS )Nr2   r   r   ztl.math.min(rW   rG   )rH   rI   rK   sympyMinrM   rN   nargsmidabs         rO   
_print_MinzTritonPrinter._print_MinR       DIItyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?34aS1#Q''rQ   c                   t        |j                        }t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }| j                  t        j                  |j                  d |        }| j                  t        j                  |j                  |d         }d| d| dS )Nr2   r   r   ztl.math.max(rW   rG   )rH   rI   rK   r^   Maxr`   s         rO   
_print_MaxzTritonPrinter._print_Max\   rf   rQ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr2   tl.abs(r   rG   )rH   rI   rK   rL   s     rO   
_print_AbszTritonPrinter._print_Absf   s9    499~"""TYYq\23155rQ   N)	__name__
__module____qualname__rP   rT   r\   re   ri   rl    rQ   rO   rD   rD   D   s"    IO*((6rQ   rD   c                    t        |       j                  d      d   }|dk(  rd}n|dv rd}n|dk(  rd}n|d	k(  rd
}d| S )N.boolint1)float16bfloat16float32float8_e4m3fn
float8e4nvfloat8_e5m2float8e5ztl.)strsplit)dtypetriton_type_names     rO   triton_compute_typer   o   sd    5z'',R06!!	4	4$	_	,'	]	*%!"##rQ   c                ~    t        |       r(| j                  r| t        j                  k(  rdnd}d| S t	        |       S )N@       ztl.int)r   	is_signedtorchint64r   )r   nbitss     rO   triton_acc_typer   }   s:    5??u{{*wu%%rQ   c                    | t        d      k(  ry| t        d      k(  ryt        j                  |       ryt        |       S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    rO   triton_constantr      s9    e	%-		E	;rQ   c                  &     e Zd Zd fdZd Z xZS )TritonCSEVariablec                D    t         |   ||       t               | _        y N)super__init__set	mask_vars)rM   namebounds	__class__s      rO   r   zTritonCSEVariable.__init__   s    v&#&5rQ   c                <   |dk(  ry |D ]  }t        |t              r&| j                  j                  |j                         9t        |t        j
                        sT|j                  d   dv sf| j                  j                  |j                  d    dh        y )Nwherer   xyrmask)
isinstancer   r   updater^   Symbolr   )rM   r   rI   kwargsargs        rO   update_on_argsz TritonCSEVariable.update_on_args   s|    
 7?C#01%%cmm4C.388A;%3G %%#((1+d';&<= rQ   )r   r   )rm   rn   ro   r   r   __classcell__r   s   @rO   r   r      s    )
>rQ   r   c                     e Zd ZdZedKdLd       ZedMd       Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@ed>        ZAed?        ZBed@        ZCedA        ZDedB        ZEedC        ZFedD        ZGedE        ZHedF        ZIedG        ZJedH        ZKedI        ZLedJ        ZMy)NTritonOverrideszMap element-wise ops to TritonNc                   	 	 	 	 	 	 dd}|>t         |||      t        j                  j                        t        j                  _        |t        j
                  k(  rd|  dS |t        j                  k(  r|  dS |  dt        |       dS )Nc                   | |k(  ryt         j                  t         j                  h}| |v r||v r| |k7  rJ d       | t         j                  k(  s|t         j                  k(  ry| t         j                  k(  s|t         j                  k(  ryy)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!   r   )r   ry   r{   )	src_dtype	dst_dtype
fp8_dtypess      rO   _get_min_elements_per_threadz>TritonOverrides.to_dtype.<locals>._get_min_elements_per_thread   s     I% ##!!J Z'+*U U	U 
 E---e>O>O1OE///9@S@S3SrQ   (z != 0)z.to(tl.int8).to(tl.uint8).to(rG   )r   torch.dtyper   r   returnint)maxr0   kernelmin_elem_per_threadr   rt   uint8r   )xr   r   r   s       rO   to_dtypezTritonOverrides.to_dtype   s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk! S122D,U34A66rQ   c                $    |  dt        |       dS )Nr   z, bitcast=True))r   )r   r   s     rO   to_dtype_bitcastz TritonOverrides.to_dtype_bitcast   s    D,U34ODDrQ   c                p   |t         j                  k(  r2| j                  |t         j                        }| j	                  ||      S t         j
                  j                  |      }t         ||            }t        |      }|dk(  r|S t        j                  j                         }dg|z  }d| d| d| dS )Nz
tl.float32r2   tl.full(rW   rG   )r   r   constantint16r   _prims_commondtype_to_typer   r   r0   r   triton_tensor_ndim)	clsr   r   tmptype_
triton_valtriton_typendimshapes	            rO   r   zTritonOverrides.constant   s    EKK ,,uekk2C<<U++##11%8$U5\2
)%0,& xx**,d
%:,bQ??rQ   c                    d|  dS )Nrk   rG   rp   r   s    rO   abszTritonOverrides.abs       1~rQ   c                    d|  dS )Nztl.math.abs(rG   rp   r   s    rO   libdevice_abszTritonOverrides.libdevice_abs       aS""rQ   c                    d|  dS )Nztl.exp(rG   rp   r   s    rO   expzTritonOverrides.exp   r   rQ   c                    d|  dS )Nztl.math.exp(rG   rp   r   s    rO   libdevice_expzTritonOverrides.libdevice_exp   r   rQ   c                    d|  dS )Nztl.math.exp2(rG   rp   r   s    rO   exp2zTritonOverrides.exp2      qc##rQ   c                    d|  dS )Nztl.math.expm1(rG   rp   r   s    rO   expm1zTritonOverrides.expm1      s!$$rQ   c                    d|  dS )Nztl.sqrt(rG   rp   r   s    rO   sqrtzTritonOverrides.sqrt
  s    !ArQ   c                    d|  dS )NrS   rG   rp   r   s    rO   libdevice_sqrtzTritonOverrides.libdevice_sqrt  r   rQ   c                    t         j                  j                  }|dk(  ry|dk(  r	d|  d|  dS |dk(  r|  dS |t        j                  d	|       S t        d
|      )Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", rG   accuracyz + 10z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )r   tritoninject_relu_bug_TESTING_ONLYr/   maximumAssertionError)r   bugs     rO   reluzTritonOverrides.relu  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;sA&& LSGT rQ   c                    d|  d| dS )Nztriton_helpers.minimum(rW   rG   rp   rc   rd   s     rO   minimumzTritonOverrides.minimum$      (2aS22rQ   c                    d|  d| dS )Nztriton_helpers.maximum(rW   rG   rp   r   s     rO   r   zTritonOverrides.maximum(  r   rQ   c                    d|  d| d| dS NrV   rW   rG   rp   )rc   rd   rY   s      rO   r   zTritonOverrides.where,  s    1#Rs"QCq))rQ   c                    d|  dS )Nztl.cos(rG   rp   r   s    rO   coszTritonOverrides.cos0  r   rQ   c                    d|  dS )Nztl.math.cos(rG   rp   r   s    rO   libdevice_coszTritonOverrides.libdevice_cos4  r   rQ   c                    d|  dS )Nztl.sin(rG   rp   r   s    rO   sinzTritonOverrides.sin8  r   rQ   c                    d|  dS )Nztl.math.sin(rG   rp   r   s    rO   libdevice_sinzTritonOverrides.libdevice_sin<  r   rQ   c                   t         j                  j                  |      \  }}}}t         j                  j                  j	                  t         j                  j
                  |      }|t        j                  t        j                  hvrRt         j                  j                  j	                  t         j                  j
                  | j                  ||            }||_
        |S r   )r0   r   indexingcsegeneratecomputer   int32r   r   r   )r   rN   r   	index_strr   r   
expand_strvars           rO   
index_exprzTritonOverrides.index_expr@  s    121B1B41H.	9dJhhll##AHH$4$4i@ekk22((,,''(8(8#,,sE:RSC!
rQ   c           
     P   t         j                  j                  |       5 } |       }d d d        t         j                  j                  j	                  t         j                  j
                  d dt        |       d| d      }t        j                  ||      S # 1 sw Y   uxY w)Nr   z.shape, rW   .dtype))	r0   r   
mask_loadsr   r   r   r   r/   r   )r   bodyothernew_maskresults        rO   maskedzTritonOverrides.maskedK  s    XX  &(VF ' %%HHvhhu'=&>bP
 yy6511 '&s   BB%c                    d|  dS )Nztl.math.lgamma(rG   rp   r   s    rO   lgammazTritonOverrides.lgammaW       1%%rQ   c                    d|  dS )Nztl.math.erf(rG   rp   r   s    rO   erfzTritonOverrides.erf[  r   rQ   c                    d|  dS )Nztl.math.cosh(rG   rp   r   s    rO   coshzTritonOverrides.cosh_  r   rQ   c                    d|  dS )Nztl.math.sinh(rG   rp   r   s    rO   sinhzTritonOverrides.sinhc  r   rQ   c                    d|  dS )Nztl.math.acos(rG   rp   r   s    rO   acoszTritonOverrides.acosg  r   rQ   c                    d|  dS )Nztl.math.acosh(rG   rp   r   s    rO   acoshzTritonOverrides.acoshk  r   rQ   c                    d|  dS )Nztl.math.asin(rG   rp   r   s    rO   asinzTritonOverrides.asino  r   rQ   c                    d|  dS )Nztl.math.asinh(rG   rp   r   s    rO   asinhzTritonOverrides.asinhs  r   rQ   c                    d|  d| dS )Nztl.math.atan2(rW   rG   rp   r   ys     rO   atan2zTritonOverrides.atan2w      s"QCq))rQ   c                    d|  dS )Nztl.math.atan(rG   rp   r   s    rO   atanzTritonOverrides.atan{  r   rQ   c                    d|  dS )Nztl.math.atanh(rG   rp   r   s    rO   atanhzTritonOverrides.atanh  r   rQ   c                    d|  d| dS )Nztl.math.copysign(rW   rG   rp   r   s     rO   copysignzTritonOverrides.copysign  s    "1#Rs!,,rQ   c                    d|  dS )Nztl.math.erfc(rG   rp   r   s    rO   erfczTritonOverrides.erfc  r   rQ   c                    d|  dS )Nztl.math.erfinv(rG   rp   r   s    rO   erfinvzTritonOverrides.erfinv  r  rQ   c                    d|  d| dS )Nztl.math.hypot(rW   rG   rp   r   s     rO   hypotzTritonOverrides.hypot  r#  rQ   c                    d|  dS )Nztl.math.log10(rG   rp   r   s    rO   log10zTritonOverrides.log10  r   rQ   c                    d|  d| dS )Nztl.math.nextafter(rW   rG   rp   r   s     rO   	nextafterzTritonOverrides.nextafter  s    #A3b1--rQ   c                    |  d| S N & rp   r   s     rO   logical_andzTritonOverrides.logical_and      Cs|rQ   c                    |  dS )Nz == 0rp   rc   s    rO   logical_notzTritonOverrides.logical_not  s    E{rQ   c                    |  d| S Nz | rp   r   s     rO   
logical_orzTritonOverrides.logical_or  r8  rQ   c                    d|  d| dS )Nr    ^ rG   rp   r   s     rO   logical_xorzTritonOverrides.logical_xor  s    1#S1~rQ   c                    |  d| S r5  rp   r   s     rO   bitwise_andzTritonOverrides.bitwise_and  r8  rQ   c                    d|  S )N~rp   r:  s    rO   bitwise_notzTritonOverrides.bitwise_not  s    1#wrQ   c                    |  d| S r=  rp   r   s     rO   
bitwise_orzTritonOverrides.bitwise_or  r8  rQ   c                    |  d| S )Nr@  rp   r   s     rO   bitwise_xorzTritonOverrides.bitwise_xor  r8  rQ   c                    |  d| S )Nz << rp   r   s     rO   bitwise_left_shiftz"TritonOverrides.bitwise_left_shift      D}rQ   c                    |  d| S )Nz >> rp   r   s     rO   bitwise_right_shiftz#TritonOverrides.bitwise_right_shift  rM  rQ   c                     d| d}d|  d| dS )Nr   ).to(tl.uint32)ztl.rand(rW   rG   rp   seedoffsets     rO   randzTritonOverrides.rand  s%    VHO,$r&++rQ   c                     d| d}d|  d| dS )Nr   rQ  z	tl.randn(rW   rG   rp   rR  s     rO   randnzTritonOverrides.randn  s%    VHO,4&6(!,,rQ   c           	     ,    d| d}d|  d| d| d| d	S )Nr   rQ  ztriton_helpers.randint64(rW   rG   rp   )rS  rT  lowhighs       rO   	randint64zTritonOverrides.randint64  s1    VHO,*4&6("SED6KKrQ   c                    t         j                  j                  j                  |       }d| dt         j                  j                  j	                  d|       dS )Ntl.load(z + load_seed_offsetrG   )r0   r   rI   inputseed_offset)r   rT  r  s      rO   	load_seedzTritonOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
rQ   c                    d|  dS )Nztl.math.rsqrt(rG   rp   r   s    rO   rsqrtzTritonOverrides.rsqrt  r   rQ   c                    d|  dS )Nztl.math.log1p(rG   rp   r   s    rO   log1pzTritonOverrides.log1p  r   rQ   c                    d|  dS )Nztl.math.tan(rG   rp   r   s    rO   tanzTritonOverrides.tan  r   rQ   c                    d|  dS )Nztl.math.tanh(rG   rp   r   s    rO   tanhzTritonOverrides.tanh  r   rQ   c                    d|  dS )Nztl.sigmoid(rG   rp   r   s    rO   sigmoidzTritonOverrides.sigmoid  s    QCq!!rQ   c                    d|  dS )Nz1/(1 + tl.math.exp(-(z)))rp   r   s    rO   libdevice_sigmoidz!TritonOverrides.libdevice_sigmoid  s    &qc--rQ   c                    d|  d|  d|  dS )Nztl.math.signbit(z) if (z).dtype is tl.float32 else z < 0rp   r   s    rO   signbitzTritonOverrides.signbit  s      "!F1#-H4PPrQ   c                    d|  d| dS )Nztl.math.fmod(rW   rG   rp   r   s     rO   fmodzTritonOverrides.fmod  s    qcA3a((rQ   c                    d|  d| dS )Nztl.math.pow(rW   rG   rp   r   s     rO   powzTritonOverrides.pow  s    aS1#Q''rQ   c                    d|  dS )Nztl.log(rG   rp   r   s    rO   logzTritonOverrides.log  r   rQ   c                    d|  dS )Nztl.math.log(rG   rp   r   s    rO   libdevice_logzTritonOverrides.libdevice_log  r   rQ   c                    d|  dS )Nztl.math.isinf().to(tl.int1)rp   r   s    rO   isinfzTritonOverrides.isinf      s-00rQ   c                    d|  dS )Nztl.math.isnan(ry  rp   r   s    rO   r   zTritonOverrides.isnan
  r{  rQ   c                    d|  dS )Nztl.math.nearbyint(rG   rp   r   s    rO   roundzTritonOverrides.round  s    #A3a((rQ   c                    d|  dS )NrF   rG   rp   r   s    rO   floorzTritonOverrides.floor  r   rQ   c                H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
N // z % z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), rG   rp   )rc   rd   quotrems       rO   floordivzTritonOverrides.floordiv  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddrQ   c                    d } |t        j                  d|             } |t        j                  | d            }t        j                  ||      }| d|  dS )Nc                    |  dS )N.to(tl.int8)rp   ss    rO   to_intz$TritonOverrides.sign.<locals>.to_int!  s    S%%rQ   r   r   r  )r/   ltsub)r   r  leftrightr  s        rO   signzTritonOverrides.sign  sT    	& cffS!n%svva~&ggdE"d1#W%%rQ   c                    d|  dS )Nztl.math.trunc(rG   rp   r   s    rO   trunczTritonOverrides.trunc)  r   rQ   c                    |  d| S )Nr  rp   r   s     rO   truncdivzTritonOverrides.truncdiv-  s     D}rQ   c                    d|  dS )Nztl.math.ceil(rG   rp   r   s    rO   ceilzTritonOverrides.ceil3  r   rQ   r   )r   r   r   zOptional[torch.dtype])r   r   )Nrm   rn   ro   __doc__staticmethodr   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r"  r%  r'  r)  r+  r-  r/  r1  r3  r7  r;  r>  rA  rC  rF  rH  rJ  rL  rO  rU  rW  r[  ra  rc  re  rg  ri  rk  rm  ro  rq  rs  ru  rw  rz  r   r~  r  r  r  r  r  r  rp   rQ   rO   r   r      s   (+7 +7Z E E @ @.   # #   # # $ $ % %   $ $  " 3 3 3 3 * *   # #   # #   	2 	2 & & # # $ $ $ $ $ $ % % $ $ % % * * $ $ % % - - $ $ & & * * % % . .                     , , - - L L 
 
 % % % % # # $ $ " " . . Q Q ) ) ( (   # # 1 1 1 1 ) ) % % e e & & % %  
 $ $rQ   r   c                       e Zd ZdZ ej
                  d       ej
                  d      d	 	 	 	 	 	 	 	 	 	 	 d fdZd Z xZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    r2   )divisorlengthc                   t         	|           || _        || _        || _        || _        || _        || _        || _        || _	        y r   )
r   r   r   var_list
var_rangesnumelprefixr  r  r   )
rM   r   r  r  r  r  r   r  r  r   s
            rO   r   zIterationRanges.__init__H  sH     		 $
rQ   c                R    | j                   dk(  xr | j                  j                   S )Nr)r  r   persistent_reductionrM   s    rO   is_loopzIterationRanges.is_loop^  s#    {{c!J$++*J*J&JJrQ   )r   r}   r  zList[sympy.Symbol]r  zDict[sympy.Symbol, sympy.Expr]r  
sympy.Exprr  r}   r   TritonKernel)	rm   rn   ro   r  r^   Integerr   r  r   r   s   @rO   r  r  8  sk    . a u}}Q % 3	
   ,KrQ   r  c                  t     e Zd Z	 d	 	 	 	 	 	 	 	 	 d fdZd Zd ZddZddZddZd Z	d Z
d	 Zdd
Z xZS )IterationRangesRootc                b    |i }t         |   |g i |||       || _        i | _        || _        y )N)r   r  r  r  r  r   )r   r   indexnodes	pid_cache)rM   r   r  r  r  r   r  r   s          rO   r   zIterationRangesRoot.__init__c  sM     I 	 	
 
=?
 *3rQ   c                b    | j                   j                         D ]  }|j                           y r   )r  valuescache_clear)rM   nodes     rO   r  zIterationRangesRoot.cache_clear}  s%    JJ%%'D (rQ   c                   t         j                  j                  j                  ||z  | j                        r#t        t        | j                   d      |      }n#t        t        | j                   d      ||      }|| j                  vrt        | j                   t        t         j                  j                         ||||       }|t         j                  j                  |j                         <   | j                   j#                  |j                                || j$                  |j                         <   || j                  |<   | j                  |   S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        r  )r0   graphsizevarsstatically_known_equalsr  r   r,   r  r   r  IterationRangesEntrynextr   iter_vars_countrange_tree_nodessymbolr  appendr  )rM   r  r  rN   r  s        rO   lookupzIterationRangesRoot.lookup  s	    7733Gf4DdjjQLDKK=)>?ID"<4;;-u0E#FQWXDtzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3DOODKKM*#DJJtzz$rQ   c                    t        j                  d      }g }t        |      D ](  }|j                  | j	                  ||             ||z  }* t        t        |            S Nr2   )r^   r  reversedr  r  list)rM   lengthsr  itervarsr  s        rO   construct_entriesz%IterationRangesRoot.construct_entries  sW    --"w'FOODKK89&G ( HX&''rQ   c                f    | j                  |      D cg c]  }|j                          c}S c c}w r   )r  r  )rM   r  es      rO   	constructzIterationRangesRoot.construct  s-    $($:$:7$CD$Cq
$CDDDs   .c           
     `  	 |j                   D cg c]+  }t        j                  j                  j	                  |      - }}|D cg c]!  }|s|j
                  | j
                  k(  s |# }}|j                  d        t        j                  d      g g 		fd}|D ]v  }t        j                  j                  j                  |j                        s8 || j                  t        |j                                     |j                   ||       x t        j                  j                  j                  | j                        s, || j                  t        | j                                     t!        t#                    t!        t#        	            fS c c}w c c}w )z,Figure out vars from this tree used in indexc                h    t         j                  j                  j                  | j                        S r   )r0   r  r  	size_hintr  r   s    rO   <lambda>z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s    !1!1!;!;AII!FrQ   keyr2   c                    j                  | j                                j                  | j                         | j                  z  y r   )r  r  r  )r  r  
index_varssizess    rO   addz/IterationRangesRoot.vars_and_sizes.<locals>.add  s5    dkkm,LL%+GrQ   )free_symbolsr0   r   r  getr  sortr^   r  r  r  r  r  r  r   r  r  r  )
rM   r  r  r  nr  r  r  r  r  s
          @@@rO   vars_and_sizesz"IterationRangesRoot.vars_and_sizes  sB   ;@;M;MN;Ma**..q1;MN!CEqQ188t{{+BEC

F
G--"
	, D77##;;DLL'RDKK$,,)HIJ,,I  ww77

GLGXdjj'%BCDHZ()4+@@@/ OCs   0F&F+F+*F+c                    | j                   j                  | j                  | j                        }| j                   j                  }|dk7  rd| dnd}d| j                  j                          d| | S )Ntl.int32r   rG    ztl.arange(0, zBLOCK))r   indexing_size_strr  r  index_dtypeupper)rM   sizer  converts       rO   ranges_codezIterationRangesRoot.ranges_code  sk    {{,,TZZEkk--+6*+DDQ'"t{{00236$yIIrQ   c                    | j                   j                  }| j                   j                         }dg|z  }d| d| d| dS )Nr2   r   rW   rG   )r   r  r   )rM   r   r  r   r  s        rO   scalar_codezIterationRangesRoot.scalar_code  sI    kk--{{--/sTz$r%;-q99rQ   c                    d| j                    d}| j                  j                  ||      }| j                  j                  dk7  r| d| j                  j                   dS |S )Nztl.program_id(rG   r  r   )r  r  r  r   r  )rM   r  pids      rO   get_pidzIterationRangesRoot.get_pid  s_    tzzl!,nn  c*;;""j0U$t{{667q99
rQ   c                H   | j                   }| j                         r%|j                  | j                   d| d| d       n|dk(  rE| j                  j
                  r/|j                  | j                   d| j                                 ns|s| d| j                          }n| j                  | d      }|j                  | d| j                          d|j                          d| j                   d| g       |j                  | d	| j                   d
| d       y )N = z	offset + baser  rT  z	offset = z * BLOCKzmask = z < r  )r  r  	writeliner   r   r  r  r  
writelinesr  r  )rM   codeno_x_dimr   lines        rO   codegen_headerz"IterationRangesRoot.codegen_header  s   KK<<>NNdii[A3is$?@#X$++::NN99+S!1!1!3 45 Id&6&6&8%9:''1#V5OOc4<<>"2#aggi[FyykTF+ 	!GDII;c!E:;rQ   r   )
r   r}   r  r  r  r}   r  r   r   r  )r  zList[sympy.Expr]r  r  )F)rm   rn   ro   r   r  r  r  r  r  r  r  r  r  r   r   s   @rO   r  r  b  sm     33 3 	3
 3 34 .(EA6J:<rQ   r  c                  d     e Zd Z	 	 	 	 	 	 	 	 	 	 d
 fdZd Zd Zd Zd Zd Zd Z	d Z
d	 Z xZS )r  c           
        t         |   ||j                  |z  |j                  |j                  |j
                  |||j                         || _         t        j                  d       | j                        | _        || _        y )N)r   r  r  r  r  r  r  r   )r   r   r  r  r  r  r   parent	functools	lru_cache_codegencodegenrN   )rM   r   r  r  rN   r  r   s         rO   r   zIterationRangesEntry.__init__  sw     	,,'__((==== 	 		
 0y**40?	rQ   c                L    fd| _         d | j                   _        | _        y )Nc                      S r   rp   r   s   rO   r  z/IterationRangesEntry.set_name.<locals>.<lambda>   s    trQ   c                      y r   rp   rp   rQ   rO   r  z/IterationRangesEntry.set_name.<locals>.<lambda>  s    4rQ   )r  r  r   )rM   r   s    `rO   set_namezIterationRangesEntry.set_name  s    ##/ 	rQ   c                8    | j                   j                          y r   )r  r  r  s    rO   r  z IterationRangesEntry.cache_clear  s      "rQ   c                    | j                         r*t        j                  j                  j	                  |       y t        j                  j
                  j	                  |       y r   )r  r0   r   indexing_coder  r	  )rM   r  s     rO   r  zIterationRangesEntry.writeline  s;    <<>HH"",,T2 HHMM##D)rQ   c                    | j                  | j                   dt        t        j                  j                  | j                              z          | j                  S )Nr  )r  r   texprr0   r   rename_indexingrN   r  s    rO   r  zIterationRangesEntry._codegen  s@    $))C(51I1I$))1T+UUVyyrQ   c                   g }t        | j                  t        j                        r|S t        | j                  t        t
        f      sJ t        | j                               | j                  j                  dd  D ]l  }t        |t        j                  t        j                  f      r.|j                  }t        |      dkD  sIt        d |D              s\|j                  |       n |S )Nr2   r   c              3  R   K   | ]  }|j                   j                  d        ! yw)r  Nr   
startswith.0r  s     rO   	<genexpr>z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>  s      +TGqAFF,=,=c,BGs   %')r   rN   r^   r   r   r   typerI   r  r  rH   allr  )rM   precomputed_argsr   symbolss       rO   r  z%IterationRangesEntry.precomputed_args  s    -/dii.##$))h%@AR4		?RA99>>!"%CcEMM5<<#@A**w<!#+TG+T(T$++C0	 &
  rQ   c                ,    t        | j                        S r   )r,   r   r  s    rO   r  zIterationRangesEntry.symbol  s    DII&&rQ   c                ,    t        | j                        S r   )hashr   r  s    rO   __hash__zIterationRangesEntry.__hash__"  s    DIIrQ   c                4    | j                   |j                   k(  S r   r  )rM   r
  s     rO   __eq__zIterationRangesEntry.__eq__%  s    yyEJJ&&rQ   )
r   r}   r  r  r  r  rN   r  r  r  )rm   rn   ro   r   r  r  r  r  r  r  r  r  r   r   s   @rO   r  r    s`      	
   ,
#* ''rQ   r  c                      e Zd ZeZeZddej                  dd	 	 	 d0 fdZ	d Z
d1dZd Zd Zd	 Zd
 Ze	 	 	 	 d2d       Ze	 	 	 	 d2d       Zd3dZd4dZd4dZd5dZd6dZdddd	 d4dZd Zd Zd7dZej<                  d        Z fdZ d Z!e"d8d       Z#d4dZ$d9dZ%d:dZ&	 	 	 	 	 	 	 	 	 	 d;dZ'd  Z(ed!        Z)d" Z*d# Z+d$ Z,d% Z-d& Z.d:d'Z/d( Z0d) Z1d<d*Z2d+ Z3d:d=d,Z4d- Z5d. Z6d/ Z7 xZ8S )>r  Nr   )	mutationsr  reduction_hintr   c                   |i }t         	           |D cg c]+  }t        j                  j                  j                  |      - c} _        ||n	t                _        g  _	        i  _
        t        j                          _         j                  d   dk7   _        t                _        t                _        t                _        t                _        | _        | _        | _        t                _         j1                          _         j(                  t4        j6                  k(  xr:  j2                  xr, t9         j                        dk(  xr  j                  d   dk\   _         j=                  |       t                _        tA        jB                  d       d fd       }| _"        y c c}w )Nrs   r2   r      c                    t         j                  j                  j                  | j	                               } j
                  D ]  }j                  | |      }  | S r   )r0   r  r  simplify_with_rangesr  range_treescombine_contiguous_dims)r  treerM   s     rO   simplify_indexingz0TritonKernel.__init__.<locals>.simplify_indexingU  sM    GG$$99%ARSE((44UDA )LrQ   r  )#r   r   r0   r  r  simplifynumelsr   r  r  r  	itertoolscountr  inside_reductionr6   r	  r  suffixoutside_loop_varsr  r  r   
last_usageshould_use_persistent_reductionr  r   INNERrH   r  initialize_range_treeautotune_hintsr  r  r   )
rM   r  r  r  r  r   groupsr  r   r   s
   `        rO   r   zTritonKernel.__init__-  s    I=CDVqww''003VD090E93568JL(0 $B1 4"$	+-&4&6+.5, +#6 $'E*.*N*N*P!=#6#66 '))'DKK A%' B3&	 	 	""9- 25 
		T	"	 
#	 "3G Es   0F;c                     y)z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Trp   r  s    rO   need_numel_argszTritonKernel.need_numel_args^  s     rQ   c                   | j                   rt        j                  j                  syt        j
                  dij                  | j                  d      }| j                  d   }t        |t        t        j                  f      syt        j                  j                  j!                  |      }||kD  ryt        j                  j                  j#                  | j                  d   t%        |             y)z^
        Heuristic to set self.persistent_reduction and add guards
        if needed.
        Fi   r   rs   T)r%  r   r   persistent_reductionsr   r*  r  r  r"  r   r   r^   r  r0   r  r  r  	guard_leqr(   )rM   	threshold
last_numelhints       rO   r)  z,TritonKernel.should_use_persistent_reductionh  s    
 %%&--*M*M

#d!!2
& 	 [[_
*sEMM&:;ww))*5)	""4;;r?OD4IJrQ   c                    | j                   r| j                  ry t        t        j                  j                  d |D                    | _        y )Nc              3  F   K   | ]  }|t         us|j                    y wr   )EnableReductionr(  )r
  r  s     rO   r  z.TritonKernel.set_last_usage.<locals>.<genexpr>  s      *&+q/Ges   !!)r%  r  r   r#  chainfrom_iterabler(  )rM   r  s     rO   set_last_usagezTritonKernel.set_last_usage}  sA    $$(A(AOO)) *&+* 
rQ   c                   t        t        g dd t        | j                        dz
               dgz   }t	        t        | j                              D ]e  }||   d   dk(  r|ndj                  ||   d         }| j                  j                  t        ||   | j                  |   ||   d   || |             g | j                  D ]9  }|j                         r|j                  | j                  | j                         ; | j                  rX| j                  d   j                         r:| j                  j                  d| j                  d   j                                 y y y )	N)xindexyindexzindexr2   rindexr   r  xyzrs   zrbase = )r  r  rH   r"  rangefindr  r  r  r  r  r	  r  r%  r  r  )rM   r  namesipid_idxr  s         rO   r+  z"TritonKernel.initialize_range_tree  s@   34Jc$++6F6JKL
J s4;;'(A 8A;#-a5::eAhqk3JG###!Hdkk!neAhqk7D) ) $$D<<>##DIIt}}= %   T%5%5b%9%A%A%C II(4+;+;B+?+K+K+M*N OP &D rQ   c                B     t         j                   fd       } |       S )Nc               3    K    j                   d   dk(  r j                  rJ d  y  j                  s j                          d _        	 d   j                  s j                          d _        y # d _        w xY ww)Nrs   r2   FT)r"  r%  r  codegen_bodyr  s   rO   ctxz+TritonKernel.disable_reduction.<locals>.ctx  sx     {{2!#0000,, !!#$)D!-00%%'(,%%s   A	B A5 -B5	A>>B)
contextlibcontextmanager)rM   rJ  s   ` rO   disable_reductionzTritonKernel.disable_reduction  s$    		"	"	- 
#	-$ urQ   c                    t        |      t        | j                        k(  sJ t        || j                        D cg c]  \  }}|j                  |       c}}S c c}}w r   )rH   r  zipr  )rM   r  r  rangess       rO   
set_rangeszTritonKernel.set_ranges  sb    7|s4#3#34444 #&gt/?/?"@
"@ V$"@
 	
 
s   Ac                   t         j                  j                  | D cg c]  }g  c}| D cg c]  }j                  |       c}t	        j
                         fd}d }g }d}|D ]C  }g }	|D ]'  }
j                  |
d      r|	j                  d        )|t              k  rBj                  |         dk(  r+|dz  }|t              k  rj                  |         dk(  r+j                  |
      j                  |         kD  r^j                  |
|         s
t               |   }t        |
|         }|	j                   || |||       ||dz   |                   |	j                  t        j                   |||
                   * |j                  |	       F t        d D              sJ d d|        |fS c c}w c c}w )	Nc                    j                  |      }j                  |    |      s
t               t        |    |      | <   |    j	                  |       t              S r   )r!  statically_known_multiple_of	CantSplitr   r  r  )rE  rN   
new_ranges	remainingsv	var_counts     rO   	add_rangez7TritonKernel._split_iteration_ranges.<locals>.add_range  s]    ;;t$D229Q<Fk!#IaL$7IaLqM  &	?"rQ   c                      fd}|S )Nc                     |    z  |    z   S r   rp   )	flat_varsidx1idx2r  s    rO   getterzKTritonKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  s    io-	$??rQ   rp   )r  r^  r_  r`  s   ``` rO   make_combinedz;TritonKernel._split_iteration_ranges.<locals>.make_combined  s    @ MrQ   r   r2   c                ,    t        j                  d      S Nr   )r^   r  )_s    rO   r  z6TritonKernel._split_iteration_ranges.<locals>.<lambda>  s    EMM!4DrQ   c              3  t   K   | ]0  }t         j                  j                  j                  |      d k(   2 ywr2   Nr0   r  r  r  r	  s     rO   r  z7TritonKernel._split_iteration_ranges.<locals>.<genexpr>  s.      
8A1AGG&&q)Q.	s   68zfailed to set ranges  )r0   r  r  r!  r#  r$  r  r  rH   r  rT  rU  r   operator
itemgetterr  )r-  r  rd  grZ  ra  return_getters_groupscurrent_grouplength_groupreturn_gettersr  size1size2rV  rW  rX  rY  s                @@@@rO   _split_iteration_rangesz$TritonKernel._split_iteration_ranges  s    WW:@-A&Qb&-A
-34VR[[^V4	OO%		#	 !##LN$--dA6"))*DE "C	N2Y}%=>!C "Q&M	 "C	N2Y}%=>!C
 <<%Y}5M(NN::i6 (k)%m4E$T9]+CDE"))%!%mU;%ma&7? #)) ++ImT,JK9 %> "((8C $F  
8A
 
 	9"9+Qwi8	9 
 000y .B4s
   	GG"c                H    	 | j                  ||       y# t        $ r Y yw xY w)NTF)rr  rU  )r   r-  r  s      rO   is_compatiblezTritonKernel.is_compatible  s,    	''8 		s    	!!c           
        | j                   D cg c]  }|j                   }}| j                  st        j                  d      |d<   t        |      t        | j                         k(  r+t        d t        ||      D              r | j                  | S | j                  ||      \  }}t        t        j                   | j                  |        }|D cg c]  }|D cg c]
  } ||       c} c}}S c c}w c c}w c c}}w )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        r2   rs   c              3     K   | ]?  \  }}t         j                  j                  j                  t	        |      |z
        d k(   A ywr   Nr0   r  r  r!  r*   )r
  r   rk  s      rO   r  z4TritonKernel.split_and_set_ranges.<locals>.<genexpr>  s@      9
,1 GG%%mA&6&:;q@,s   AA)r  r  r%  r^   r  rH   r  rO  rQ  rr  r  r#  r9  )	rM   r  rtr-  rV  rl  r  fnsfns	            rO   split_and_set_rangesz!TritonKernel.split_and_set_ranges  s     &*%5%56%5r"((%56$$q)F2Jw<3t//00S 9
GV,9
 6
 #4??G,,,0,H,HG-
)
) 	*)EFG8MN8M,"H,8MNN 7 -Ns   C7	DC<.D<Dc                    t        |d      S )Nr   )r5   rM   r  s     rO   is_indirect_indexingz!TritonKernel.is_indirect_indexing!  s    %eU33rQ   c                   | j                  |      rydgt        | j                        z  }|j                  D ]g  }|| j                  vr| j                  |   }t        |j                  t              sJ ||j                  j                  xx   |j                  z  cc<   i t        j                  j                  j                  t        fdt        || j                        D              S )NFr2   c              3  F   K   | ]  \  }} |       |      k7    y wr   rp   )r
  	idx_range
iter_ranger!  s      rO   r  z.TritonKernel.is_broadcasted.<locals>.<genexpr>6  s-      
)G%	: Y8J#77)Gs   !)r  rH   r"  r  r  r   r  r  r  r  r0   r  r  r!  anyrO  )rM   r  index_numelsr  entryr!  s        @rO   is_broadcastedzTritonKernel.is_broadcasted%  s    $$U+sS--((FT222))&1Eell,?@@@++,<, ) 77##,, 
),\4;;)G
 
 	
rQ   c                   t        |t        j                  t        j                  f      r|S |j	                  |      \  }}t        |      dk  r|S t        j                  j                  j                  ||t        |g||            \  }}}||k(  r|S |j                  |      }t        |t        t        | ||                        }	|	S )zI
        More aggressive simplification to merge contiguous dims
        r2   )r   r^   r  r   r  rH   r0   r  r  _simplify_loopsr7   r  r+   dictrO  )
rM   r  r  r  r  	new_sizesreindexprunenew_index_vars	new_indexs
             rO   r  z$TritonKernel.combine_contiguous_dims;  s     eemmU\\:;L //6
Eu:?L$%GG$4$4$D$D7US%
!	7E L	2ud3z7>;R+S&TU	rQ   c                T    t        | j                  | j                  |                  S )a  
        Convert an index expr to a string that can be used in triton code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the triton kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        )r  r  codegen_indexingr~  s     rO   index_to_strzTritonKernel.index_to_strM  s%     T))$*?*?*FGHHrQ   F)
copy_shapedense_indexingoverride_maskc               P   | j                  |      }t        |t        j                  j                  j
                        }t        |j                  t        j                              s(t        |j                  t        j                              r3|j                  t        j                  j                  j
                        }t        |j                  t        j                              r|j                  t        j                        D ]g  }|j                  }t        |      dkD  st        d |D              s1|t        j                  j                  j                  |      i}t        ||      }i |j                  }| j                  |      }| j                  |      }	t!               }
|D ]  }t#        |t        j$                        sJ |r"|j&                  j)                  d      r?| j*                  j,                  |j&                     }|
j/                  |j0                         ||j&                  j)                  d      r|j&                  d   dv sJ |j&                         |
j3                  |j&                  d    d        t4        j6                  j8                  xs |xs | j:                  duxr |dk7  }d}d	}t!               }| j<                  D ]\  }|j>                  d
k(  r| j@                  s|jC                  |jD                        rd}nd	}|j3                  |j>                   d       ^ d}t#        |t        jF                        r.|r| dn| jI                         }d| d|	 d}	|	t!               d|fS |r%|s#|r| dn| jI                         }d|	 d| d}	|}
n|s|rd|	 d| d}	|}
|r|h}
| j:                  r|
j3                  | j:                         | jK                  |
       |
r(djM                  tO        tQ        tR        |
                  nd}|	|
||fS )zO
        Compute the index and mask to pass to tl.load() or tl.store()
        r   c              3     K   | ]<  }|j                   j                  d       xs |j                   j                  d       > yw)r  psNr  r	  s     rO   r  z(TritonKernel.indexing.<locals>.<genexpr>s  s:      ,OV!AFF%%c*Eaff.?.?.EEws   AAr   )r  r  rE  r   r   NTFr  z.shaper   rW   z, tl.int32)Nonetl.broadcast_to(rG   .shape)r6  )*r   r+   r0   r  r  precomputed_replacementsrH   atomsr^   r  ceilingsubsr  r  lookup_precomputed_sizer  r   r   r   r   r  r   varname_mapr   r   r  r   r   r  
_load_maskr  r  r%  intersectionr  r  dense_size_strfilter_masksjoinsortedmapr}   )rM   r  r  r  r  rc   r  replacementsr  r  r   r  cse_var
need_dense
have_densehave_loop_varsdense_mask_varsr  r  mask_strs                       rO   r   zTritonKernel.indexingX  s    &&u-5!''"2"2"K"KLu{{5;;'(CEMM0J,KJJqww//HHIE u{{5==)*[[/ ..w<!# ,OV, ) %&qww'7'7'O'OPQ'R#SL&ul;E 0 ''
&&u-%%e,	!e	Cc5<<000$$U+((..sxx8  !2!23$$%56 xx{e+5SXX5+!T23   MM(( ++d* qj	 	 
%$$D{{c!$*?*?&&t}}5!%"
4;;-t 45 % 
eU]]+2<J<v.$BUBUBWJ":,b;GIceVZ77j2<J<v.$BUBUBWJ*9+R
|1EI'IJ*9+R
|7KI'I&I??MM$//*)$>G5::fSi%89:V)Xz99rQ   c                J   | j                   D ]  }t        j                  j                  j	                  |j
                  d      r|j                  |j                   d       W|j                  j                         t        j                  j                  vrt        j                  j                  |j                  j                            }t        j                  j                  j                  |j
                  |      s|j                  |j                   d        y )Nr2   r   )r  r0   r  r  r  r  discardr  r  r   r   	max_blockrT  )rM   r   r  r  s       rO   r  zTritonKernel.filter_masks  s    $$Dww77

AF!!T[[M"67 {{  "&--*A*AA//0A0A0CDI
 ww<<TZZS!!T[[M"67 %rQ   c                t    t        t        j                  j                  d | j                  D                    S )Nc              3  P   K   | ]  }|j                   j                            y wr   )r  items)r
  r  s     rO   r  z*TritonKernel.var_ranges.<locals>.<genexpr>  s"      *4DD%%'4Ds   $&)r  r#  r9  r:  r  r  s    rO   r  zTritonKernel.var_ranges  s4    OO)) *484D4D* 
 	
rQ   c                4   t         j                  j                  j                  || j	                               }t        |j                  t              D ]  }|| j                  v si }| j                  |   j                         D ].  }t         j                  j                  j                  |      ||<   0 t        |      dkD  r5t        | j                  |   j                  |      | j                  |   _        | j                  |   j                           |S )Nr  r   )r0   r  r  r  r  r  r  r}   r  r  r  rH   r+   rN   r  )rM   rN   symr  r  s        rO   r  zTritonKernel.codegen_indexing  s    ww44T4??;LM$++5Cd+++  "//4EEGB'(ww'7'7'O'OPR'SL$ H|$q(6@--c2777D))#.3 %%c*224 6 rQ   c              #     K   | j                   }|r+| j                  j                  | j                  | d|       }|| _         	 | || _         y# || _         w xY ww)z:Context manager to add an additional mask to tl.load/storer6  N)r  r   r   r   )rM   r   priors      rO   r  zTritonKernel.mask_loads  sX      88$$T\\dV3ug3FGD	$J#DOeDOs   AAA A	AAc                ^    t         j                  j                  d u xr t        |   |      S r   )r   versionhipr   generate_assert)rM   checkr   s     rO   r  zTritonKernel.generate_assert  s'    }}  D(KUW-DU-KKrQ   c                   d}t        |j                        }| j                  r|j                  | j                         |r?t	        |      dk(  rt        t        |             nddj                  d |D               d}|S )Nr  r2   r   r6  c              3  2   K   | ]  }t        |        y wr   )r}   )r
  vs     rO   r  z)TritonKernel.load_mask.<locals>.<genexpr>  s     #>IqCFIs   rG   )r   r   r  r  rH   r  iterr  )rM   r  r   r   s       rO   	load_maskzTritonKernel.load_mask  sv    &	??MM$//* y>Q& Y()#>I#>>?qA 
 rQ   c                     y)Nztl.device_assertrp   r  s    rO   assert_functionzTritonKernel.assert_function  s    !rQ   c                (   | j                   j                         D ci c]  \  }}||j                   }}}t        ||      }i }| j                  D ]7  }t        |j                        }t        ||di      t        ||di      z
  ||<   9 |S c c}}w )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        r2   r   )r  r  rN   r+   r  r,   r   )	rM   r  kr  index_to_tile_indexesindex_in_tile_varsstrides
range_treer  s	            rO   get_strides_of_loadz TritonKernel.get_strides_of_load  s     8<7L7L7R7R7T U7Ttq!AFF7T U'/DE**JZ__-A#$6A?*"QFC GAJ +
  !Vs   Bc           	        | j                   j                  |      }| j                  |      }|}| j                  |      \  }}}}t	        d | j                  |      j                         D              }	| j                  |      rd}
n|	sd}
n| j                  r| j                  sx|| j                   j                  v r-t        | j                   j                  |   j                        }n|h}t        || j                  z        dkD  }| xr d|v xs |}|rd}
nd}
nd}
d|v sd|v r3t        j                   j#                  |      t$        j&                  k7  rd}nd}d }t        j                   j)                  |      r|}nt+        |t,        j.                        rd	| d
| d}|}nd	| d
| d| |
 | d	}t        j                   j#                  |      }|t$        j0                  t$        j2                  fv r|dz  }|t$        j&                  k(  rt$        j4                  j6                  |dz  }d|v r| j8                  }n7| j                  r| j                  sd|vr|s| j:                  }n| j<                  }| j>                  jA                  ||      }t+        |tB              sJ ||_"        |r%d| d| d}| j>                  jA                  ||      }| j                  rd|vr| jF                  jI                  |       |S )Nc              3  &   K   | ]	  }|d k(    ywrf  rp   )r
  rE  s     rO   r  z$TritonKernel.load.<locals>.<genexpr>.  s      
MqAFM   z, eviction_policy='evict_last'r   rmaskz, eviction_policy='evict_first'r  r   z, other=0.0r]   + ())r  rG   z.to(tl.float32)z.to(tl.int1)r  rW   )%rI   r_  r  r   r  r  r  r  r%  r  inplace_buffersr   other_namesrH   r(  r0   r  	get_dtyper   rt   is_unspec_argr   r^   r  rv   rw   r  r  r   r	  loadsr   r   r   r   r'  r  )rM   r   r  r  indirect_indexingoriginal_indexr   r   r  is_coalescedeprD  last_use
evict_lastr
  append_broadcastr  r   load_buffer
result_vars                       rO   loadzTritonKernel.load  s   iiood# 55e<-1]]5-A*y$
  
 44^DKKM
 
 ~.1B1B""4+D+Dtyy000DII55d;GGH54??23a7H%P7d?+O>OJ56B TMW_!''2C2CD2IUZZ2W!EE77  &D.%--8!#d>*:"=#- !#d5'TF2$ugQGGG%%d+E77))

"u}}'8'8'@ &D=,,K!!--t#% ))K**KXX&&{D9
*&7888(
%j\4D3EQGD**;=J$$t(;""&&z2rQ   c           	     8   | j                   j                  |      }| j                  |      }|}| j                  |d      \  }}}	}
|| j                   j                  v }| j                  |      }|r'|r%| j                  j                  t        |d             |d| d| d| d|	 d	}n#|d	k(  rd
| d| d| d|	 d	}nt        d|       | j                  j                  t        ||             | j                  s| j                  j                  |       y y )NT)r  ztl.debug_barrier()	tl.store(r  r  rW   rG   
atomic_addztl.atomic_add(zstore mode=)rI   outputr  r   r  r  storesr  r4   NotImplementedErrorr%  r'  r  )rM   r   r  r   moder  r  r  r   r   r  
is_inplacer  r  s                 rO   storezTritonKernel.storey  s'   iit$ 55e<-1]]5QU]-V*y$
 TYY666
,,^<.KK!!,t5I"JK<se4wc%4&BD\!#C5UG3ugRvQGD%D6&:;;l467$$""&&u- %rQ   c                   | j                   j                  t        j                         | j                  j                  |      }| j                         }| j                  |      }|t        j                  k(  rd}	n!|t        j                  k(  rd}	nt        d      | j                  j                  | j                  d| d| d|	 d| d| d| d      }
|
S )z3
        See [Note: Inductor bucketize op]
        r  tl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(rW   rG   )r,  r  r"   ELEMENTS_PER_WARP_32rI   r_  r  r  r   r  r   r  r   r   r   )rM   r  offsets_nameoffsets_sizeindexing_dtyper  offsets_ptr
block_sizeoffsets_size_strtriton_dtyper  s              rO   	bucketizezTritonKernel.bucketize  s      	 A ABiiool3((*
,,\:U[[(%Lu{{*%L%G  ""LL5fXR}B|n\^_d^eeghxgyy{  }G  |H  HI  J

 rQ   c                |    | j                         }|dk(  rd| dS dg|z  }d|d<   | ddj                  |       d	S )
Nr2   z!triton_helpers.promote_to_tensor(rG   :r  rs   [rW   ])r   r  )rM   r   ndimsr  s       rO   reduction_resizezTritonKernel.reduction_resize  sW    '')A:6ugQ??b	$))E*+1--rQ   c                \    t        |t              rt        t        | |            S  | |      S r   )r   tupler  )r{  r   s     rO   _map_tuple_or_scalarz!TritonKernel._map_tuple_or_scalar  s'    eU#R((%yrQ   c                \   ()*+  j                   sJ  j                  D ch c]  }|j                   d }} j                  |       t	        |      } j
                  r|j                   j
                          j                  d   j                  } j                  D cg c]  }d }	}d|	d<    j                         ) j                  ) fd|      }* fd}
*+ fd}||f}| j                  j                  v r j                  j                  |   S t         j                        dz
  t        t         j                              z
  *t        |      } j                  j!                         }|D ch c]  }|d	   d
k7  s| c}|_        dj%                  |      ( j&                  r"t(        j*                  j-                  |      } j                  t.        |      }( fd}t1        |t2              r&t5        ||      D cg c]  \  }} |||       }}}n	 |||      }dv rUt7         j                  j9                   j:                  d| d| d            }ddd   + | j:                  |||       ndk(  rt=        j>                  ||d|      }d _         t=        j@                   jB                  d   |      }t=        jD                  ||      }d _         t=        jF                  ||      }t=        jH                  ||      }t=        j>                  ||d|      }|||f}ndk(  rl|\  }}}d| d| d| d* d	} fdtK        d      D        \  }}} j:                  jM                  | d| d| d|        t3         fd|||fD              }n j                  j9                   j:                   |
|            }nZd | }t(        j*                  jO                  |      } j                  t.        |      }t1        |t2              s5 jP                  jM                  | d! j                          d| d| d       dv rd | d"}tS        jT                  tR        jV                        jX                  } jP                  jM                  | d! j                          d| d#       ddd   + j:                  j[                  d$j%                  g d%| d&| d'+ d(| d| d| d| d)| d*( d| d&| d+| d*( d| d&| d+              | j\                  |||       nt_              r| d,}| d-} | d.}! jP                  jM                  | d/ j                          d| d        jP                  jM                  |  d/ j                          d| d        jP                  jM                  |! d/ j                          d| d       dk(  r>|\  }}} j:                  j[                  d0| d&|  d&|! d1| d|  d|! d2| d| d| d3       n8dk(  sJ  j:                  j[                  d0| d&|  d&|! d4| d| d|  d|! d5        j:                  j[                  d%| d*( d| d&| d+|  d*( d|  d&|  d+|! d*( d|! d&|! d+       |}" j                  j!                         }# j                  j!                         }$ j\                  j[                  d%|" d6|# d6|$ d7| d|  d|! d* d8|" d ja                  |" d9       d:|# d ja                  |# d9       d:|$ d ja                  |$ d9       d:       |"|#|$f}nt)        jb                  |      }% |%||      }& j:                  jM                  | d*( d|& d| d       |tR        j                  k(  r;| d;}te        |      }' j\                  jM                  | d |
|       d<|' d       n& j\                  jM                  | d |
|              | j                  j                  |<   t1        |t2              r  xjf                  ti        |      z  c_3        |S  jf                  jk                  |       |S c c}w c c}w c c}w c c}}w )=Nr   rs   r  r  c                ^    j                   j                  j                  d|  d d      S )Nr  rW   rG   r   r   r   )r  r  rM   s    rO   r  z(TritonKernel.reduction.<locals>.<lambda>  s.    dhh'' 02n5EQGrQ   c           
         dv }|rdnd}dv rj                  | d d|  d d      S j                  | d d	|  d d      S )
N>   r  r   minprodtriton_helperstl>   r   r  rr   z2(rW   rG   r   )r  )r   
use_helpermoduledimreduction_typerM   s      rO   final_reductionz/TritonKernel.reduction.<locals>.final_reduction  s~    '+HHJ)3%F/,,ha/r%3%qA  ((F81^4DAeWBseST)UVVrQ   c                x    | j                  d| d d| d| d d| dj                  | d       d       y )	Nz                _, z_tmp = triton_helpers.z_with_index(rW   )
                r  _tmp
                )splicer  )bufferr  r   r  r	  root_oprM   s       rO   final_argreducez/TritonKernel.reduction.<locals>.final_argreduce  sg    MM<5gYl5'QSTYSZZ\]`\a bC 5 5D6I JK LrQ   r2   r   r  r6  c           
     d    j                   j                  j                  d d|  d| d      S r   r  )r   defaultcondrM   s     rO   _mask_valuez+TritonKernel.reduction.<locals>._mask_value  s8    xx((LLIdV2eWBwiq"I rQ   >   argmaxargminr  zindex, r  r   r  )r  r  welford_reducesumFTwelford_combineztriton_helpers.welford(rW   rG   c              3  R   K   | ]  }j                   j                            y wr   )r   newvar)r
  rd  rM   s     rO   r  z)TritonKernel.reduction.<locals>.<genexpr>*  s     #Hx!DHHOO$5xs   $'r   r  c              3     K   | ]9  }j                   j                  j                  j                  |             ; y wr   )r   r   r   r  )r
  var_namerM   s     rO   r  z)TritonKernel.reduction.<locals>.<genexpr>-  s9      #$6 HH%%dllD4I4I(4ST$6s   ?Ard  z = tl.full(_indexz, tl.int64)r  z                z_next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                z = tl.where(r  _mean_m2_weightz = tl.zeros(z                    z@_next = triton_helpers.welford_combine(
                        z,
                        z+
                    )
                    z?_next = triton_helpers.welford_reduce(
                        z,,
                    )
                    z_tmp, z3_tmp = triton_helpers.welford(
                    z#
                )
                r  r  r  r   )6r%  r  r  r  r  r  r  r  r  r   reduction_cacherH   r   rt   r  r   r  r   r  r  r   	Reductiondefault_valuer   r   r  rO  r}   r   r   r/   	reductionr  r"  truedivr  mulrB  r  default_accumulatorr	  r   iinfor   r   r  r&  r'   r  get_reduction_combine_fnr   r'  r   r  ),rM   r   r   r
  r   r  masksreduction_range_prefixrd  reduction_sizesr  r  	cache_keyacc_typer  r  r  r  r  dmasked_valueaccumulator_indexsum_rnumelmeandxdx2m2weightwelfordaccumulatorlong_maxaccumulator_m2accumulator_weightresult_mean	result_m2result_weight
combine_fnupdatedresult_typer  r  r	  r  s,   `  `                                    @@@@rO   r(  zTritonKernel.reduction  s{
   $$$$262B2BC2B$DKK=%2BC% u??LL)!%!1!1"!5!<!<+/+;+;<+;a6+;<! ,,.)) 	
	W	 6	00088++I66$""#a'#d4==.A*BB"9-((//+
/4FuA#uF
zz% $$ll00KG//IG
 %'>A%>QR>QdaAq 1>QR*5':!55$'HH%%*+A*B',W^_%! &+e<^LLL*l<M  #33 }}UE5%@(-%B?{{40(,%WWUD)ggb"o]]5%<"B/
#44#/ b&3D6B4r&C5PQR#HuQx#H b&&&$r"Rxs7)'LM" #%)2v$6# 

 "XX..LL/,"?
 j\*Kll66~yQG//IGgu-		##"m;t/B/B/D.ERyPRS[R\\]^ !55&'
|6$:! ;;u{{377		##()T5H5H5J4K2hZWbc &+e<^L##   $%6$77NOViX M!#$5#6689>?ABXAYZ 	 *	 +/	 02	 3>	 ?F	 GR]	S	
 ##
 $0
 15v
 68
 9J7J
 KR
 SdQd
e  ZFWX%n5!+E2$.<s!3(2|7%;"		##"m<0C0C0E/Fb
RST 		##%&l43F3F3H2IH:UVW 		##)*,t7J7J7L6MRPXzYZ[ "%66',$D"fLL'' M(8@R?S T$R'7r:L9M NbBvh / *-====LL'' M(8@R?S Tr+b0@CUBV W ##\$r+gk] S TF"^4DGNK[ \#$Lb9K8LGTfSg h ) HHOO-	 $ 1""VI;f]O D MN#326H5IC5 QS!6!6+d7K!L M N3t44	{$5GHI Js4#8#8M?$9O#P"Q R	 ))]B
88S
$[%8&&"m<vRy;-qQ 

* &1M">K"5e"<KKK))%,c/+*F)GtK=XYZ KK))%,c/+*F)GH /9  +j%(""c*o5"  ""&&z2y D =X  G  Ss   b%	b?b#b#b(c                   | j                   sJ d| _         | j                  |      \  }}}}d|vsJ d| _         | j                  j                  |      }| j                  j                  t        |d| d| d| d| d	             y )	NFr  Tr  r  r  rW   rG   )r%  r   rI   r  r&  r  r4   )rM   r   r  r   r   r   rd  r  s           rO   store_reductionzTritonKernel.store_reduction  s    $$$$ %$(MM%$8!y$e### $iit$3%tE7#eWBtfANO	
rQ   c                   | j                   s1| j                  s%| j                  s| j                  s| j                  sy| j
                  rJ| j                  s=| j                  j                  d       | j                  j                         5  | j                  d   j                  | j                         | j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         | j                  j                  | j                         ddd       | j                  j                  | j                         | j                  d   j!                          n| j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         | j                  j                  | j                         | j                  j                  | j                         | j                   j#                          | j                  j#                          | j                  j#                          | j                  j#                          | j                  j#                          y# 1 sw Y   xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nz(for roffset in range(0, rnumel, RBLOCK):rs   )r  r  r  r   r&  r%  r  r	  r  indentr  r  r  r   
invalidater'  r  clearr  s    rO   rI  zTritonKernel.codegen_body  s    zz{{||{{  )B)BII JK!!#  $33DII>		  !3!34		  ,		  .		  - $ HH 6 67R ,,.IIT//0IITZZ(IIT\\*IIT[[)		%  "

+ $#s   B=KKc                8   t               }| j                  j                         \  }}}|j                  g d       |j	                         5  t        j                         }g }t        ||      D ]>  \  }}dt        |       }	t        j                  j                  |      }
|
r|j                  |	 dt        j                  j                  j                  |
j                                dt        j                  j                  j                  |
j!                                d|
j#                          d|
j%                          d
       nP|t        j                  j&                  v rt        j                  j&                  |   }|j                  |	 dt        j                  j                  j                  |j)                                dt        j                  j                  j                  |j+                                d|j,                   d|j.                   d
       nxt1        |t2              rZt        j                  j                  j5                  |j6                        }d|j8                  v rd	}|j                  |	 d
|        nt;        d|       |j=                  |	       A |j                  ddj?                  |       d       d d d        |j                  g d       g }g }d }t        j                  j@                  jB                  jD                  }|j	                         5  |j                  d| d       |j	                         5  |j                  d| d       | jF                  D ]  }tI        t        j                  j                  j5                  |jJ                              }|jL                  dk7  s| jN                  r|j=                  |       |jL                  dk7  s{|j=                  |        d| }|j                  | d| d       | jQ                         r#dj?                  tS        tT        |            dz   }nd}|j                  tU        tV        jX                         d| ddj?                  |       d| d       d d d        d d d        |j                  g d       |j	                         5  |j                  d| d       |j	                         5  |j                  d| d       |j                  dtU        tV        jX                         d| ddj?                  |       d       d d d        d d d        t[        t]        | j                  j^                  ja                                     }|j                  g d       |j	                         5  |j                  d       |j                  d       |j                  d       |j                  d       |j                  d        |j                  d!| d"       |j                  d#       |j                  d$       d d d        |S # 1 sw Y   {xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   )xY w# 1 sw Y   .xY w# 1 sw Y   |S xY w)%N)r  r  zdef get_args():arg_z = rand_strided(rW   z
, device='z	', dtype=rG   r`  r   r  z*Don't find the buffer or const tensor for zreturn ,)
rQ  zdef call(args):zwith torch.cuda._DeviceGuard():ztorch.cuda.set_device(r  streamz = get_cuda_stream(r  z.run(*args, z
grid=grid(z
), stream=)rQ  rQ  z def benchmark_all_configs(args):z.benchmark_all_configs(*args, r  )rQ  rQ  zif __name__ == '__main__':z/from torch._inductor.utils import get_num_bytesz#from triton.testing import do_benchzargs = get_args()z:ms = do_bench(lambda: call(args), rep=40, fast_flush=True)z.num_gb = get_num_bytes(*args, num_in_out_args=z) / 1e9zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))1r6   rI   python_argdefsr  rK  r#  r$  rO  r  r0   r  
get_bufferr  r  
size_hintsget_size
get_stride
get_devicer  	constantsr  stridedevicer   r   r;   r  rN   r   KeyErrorr  r  r   current_devicer  r  pexprr  r  r%  r/  r  r}   r)   KERNEL_NAMErH   r-   r  r  )rM   r  argdefs	call_args	signaturename_cnt	var_namesarg_namearg_sigr   bufconst_tensorsymval_hintgrid
extra_argsextra_args_strr  r  rN   stream_nameninplace_argss                        rO   codegen_kernel_benchmarkz%TritonKernel.codegen_kernel_benchmark  st   !(,		(@(@(B%I56]]_ (HI%(I%>!'!$x.!12gg((2$$#*$4QWW5E5E5P5PQTQ]Q]Q_5`4aacdedkdkdtdtdd  AD  AO  AO  AQ  eR  dS  S]  ^a  ^l  ^l  ^n  ]o  ox  y|  yF  yF  yH  xI  IJ  K !2!22#$77#4#4X#>L$$#*$4QWW5E5E5P5PQ]QbQbQd5e4ffhijipipiyiy  jE  jE  FR  FY  FY  F[  j\  i]  ]g  ht  h{  h{  g|  |E  FR  FX  FX  EY  YZ  [  1"#''"2"2"<"<W\\"JK
 %4&'$$z[M%BC"DXJO    *5 &?6 wtyy';&<A>?= @ 	9:
!!0066]]_<UG2FG  ,UG15 !,,D !1!1!;!;DJJ!GHD{{c)T-B-B"))$/{{c)D) - !'ug.  K=0CE7!!LM'')%)YYs3
/C%Dt%KN%'N  ;2234L@PPZ[_[d[dei[jZkku  wB  vC  CD  E' ! 4 	JK]]_<UG2FG  ,UG15   c+"9"9:;;YZhYiistxt}t}  C  uD  tE  EG  H	 !  F499#<#<#C#C#EFGDE]]_NOBCR 01L @wW =>N " { _N ! _: ! _ " sk   J	Y4&Y(BY7B'YY(&Z7AY5Z7BZYY%	 Y((Y25Y?	:ZZZc                ,    t        j                  d      S )Nz
            from torch._dynamo.testing import rand_strided
            from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
            import torch
            from torch._inductor.triton_heuristics import grid
        )textwrapdedentr  s    rO   imports_for_benchmark_kernelz)TritonKernel.imports_for_benchmark_kernelE  s    
 	
rQ   c                   ddl m} t               }g }| j                  D ]p  }t        j
                  j                  j                  |      }t        |t        t        j                  f      sd}n |t        |            }|j                  |       r | j                  r| j                  sJ d}n!| j                  rd}n|j                          d}|D|j!                  d| d       t"        j$                  r|j!                  | j'                                | j(                  j+                         \  }	}
}t-        |      D ]  \  }}t        |t.              s|j0                  t        j
                  j                  j2                  v sHt/        |j4                  t        j
                  j                  j2                  |j0                           ||<    t7               }| j8                  D ]  }|| j(                  j:                  v r(|j=                  | j(                  j:                  |          || j(                  j>                  v r\|t        j
                  j@                  vr@|| j@                  vr2|j=                  | j(                  j>                  |   jB                         || j(                  jD                  v s|j=                  | j(                  jD                  |           tG        |      }tI        || jJ                  	      }|t        j
                  jL                  jN                  jP                  t        j
                  jL                  jN                  jR                  i d
}t7        | jT                        tW        tX        jZ                        |d}| j\                  D ]  }|j^                  dk7  s| j                  st/        |j^                   d|j`                        }|j                  |       tc        || jJ                  	      |te        |	      <   |	j                  |j^                   d        tg        |      g|d<   | j\                  D ]r  }|j^                  dk(  r| j                  r| j                  r+|j^                  dk(  r| jh                  rG|	j                  |j^                  jk                          d       t | j                  r| jl                  }d| d|d| d|d|d}nBd}te        |      dk(  rte        |      dk(  rd}nd}d| d|d| d|d|d| jn                   d}|j!                  |       |jq                  d|xs tW        tX        jr                         d dju                  |	       d!       | jw                          |jy                         5  | j{                  |       | j(                  j}                         D ]  \  }}|jq                  | d"|         |j!                  | j~                         d d d        t"        j$                  r|j!                  | j                                |j                         S # 1 sw Y   HxY w)#Nr   )r(   i    r  r(  	pointwisea!  
                    import triton
                    import triton.language as tl
                    from torch._inductor.ir import ReductionHint
                    from torch._inductor.ir import TileHint
                    from torch._inductor.triton_heuristics import AutotuneHint, z
                    from torch._inductor.utils import instance_descriptor
                    from torch._inductor import triton_helpers
                )
size_dtype)rc  r\  device_typerZ  )r,  kernel_namemutated_arg_namesr  r  configsr   zBLOCK : tl.constexprz
                @z!(
                    size_hints=z%,
                    reduction_hint=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            r  r   r   ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,rW   zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r   rR  r  )Br   r(   r6   r"  r0   r  r  symbolic_hintr   r   r^   r  r  r  r%  popr  r   benchmark_kernelrt  rI   rT  	enumerater;   rN   inv_precomputed_replacementsr   r   r  input_buffersr  r  removed_buffers
inner_nameoutput_buffersr  r?   r  r   r^  r  r  r,  r}   r)   DESCRIPTIVE_NAMEr  r  r  r>   rH   r=   r  r  r  r   r  r`  r  rI  rK  codegen_static_numelsaliasesr	  rp  getvalue)rM   r   r(   r  rV  r  
numel_hintr  
heuristicsra  rd  rc  rE  r   mutated_argsmutationtriton_meta_signaturetriton_metainductor_metar  sizeargr  heuristics_line	tile_hintoldnews                             rO   codegen_kernelzTritonKernel.codegen_kernelO  s   *
[[E))77>Jj3*>? !	+C
O<	i(# !$ $$((((/J""$JNN$J<KKQ
 R\P\ ]
 &&D==?@ $		 8 8 :I	*FAs3(HH 0 0 M MM&HHagg..KKCHHU 	! + uH499222  !8!8!BCDII555AGG$;$;;D$8$88  !:!:8!D!O!OP499333  !9!9(!CD ' l+ 1$"2"2!
 /gg''66<<77,,;;@@	
 "$"5"56{;;<!-
 $$D{{c!T%:%:!T[[M"7D  )6B(8(87%c'l3 $++e45 % #,I"6!7I$$D{{c!))T-F-F{{c!dmmNNdkk//122FGH %   !00N#   *~ .$$2#3 4!!, 0##0"3 4	O I:!#y>Q& <I =I#   *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$473{6678$))G:L9MRP	
 	[[]&&t, II--/S#c#/0 0KK		"	  ""KK5578}} ]s   $A%YYc                   | j                   D ]e  }|j                  dk7  s| j                  r|t        j                  j
                  j                  |j                        }t        |t        j                  t        f      r)|j                  |j                   dt        |              |j                  dk(  r| j                  rt        j                  j
                  j                  |j                        }t        |t        j                  t        f      rt        |      }nt        |      }|j                  d|        |j                  dk(  sG| j                  sU|j                  d       h y)a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        rnumel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        r  znumel = zRBLOCK: tl.constexpr = r   zXBLOCK: tl.constexpr = 1N)r  r  r%  r0   r  r  r!  r  r   r^   r  r   r  r  r(   r  )rM   r  r  simplified_tree_numelvals        rO   r  z"TritonKernel.codegen_static_numels  s   " $$D{{c!T%:%:()(8(8(A(A$**(M%3emmS5IJNNdkk](3?T;U:V#WX{{c!d&?&?()(8(8(A(A$**(M%3emmS5IJ34C%c*!8>?{{c!dmm9:! %rQ   c                    t        t        | j                              }| j                  d   dk(  }t	        | j
                        |z
  |z
  S )Nrs   r2   )r   rt   r  r"  rH   r  )rM   r  no_r_dims      rO   r   zTritonKernel.triton_tensor_ndim  sB    tDMM*+;;r?a'4##$x/(::rQ   c                    t        t        | j                              }dg| j                         z  }|
||z
  }d||<   ddj	                  |       dS )Nr  r  r  rW   r  )r   rt   r  r   r  )rM   rE  r   r  r  idxs         rO   r  zTritonKernel.indexing_size_str  sZ    tDMM*+42244=h,CE#J499U#$A&&rQ   c                   g }| j                   D ]  }| j                  r|j                  dk(  r|j                  dk7  s| j                  r-|j	                  |j                  j                          d       g|j                  dk(  sw|j                  dk7  s|j	                  d        |dd g dk(  rt        |dd       |dd |dd	 d
dgk(  rt        |dd	       |dd	 ddj                  |       dS )Nr   r  r  r2   1r   r   )ZBLOCKYBLOCKXBLOCKr   r  r  r  rW   r  )	r  r  r  r%  r  r  r  r  r  )rM   r  r  s      rO   r  zTritonKernel.dense_size_str  s    $$D}}!3{{c!T%:%: 1 1 34E:;#

aS! % 1:77!%!*-E!AJ1:(H--!%!*-E!AJ499U#$A&&rQ   c                   t         j                  j                  }| j                  j	                         \  }}}t        t        |            D ]0  }t         j                  j                  ||         s&||   dz   ||<   2 g }| j                  D ]  }t        |j                  t        j                  t        j                  f      r|j                  }	n|j                  ||      }	|j                  dk7  s| j                   r|j#                  |	       |j                  dk7  s|j#                  |	        |j%                  ||      }|j'                  |||t         j                  j(                  j*                  j,                  dd       y )Nz.item()r  T)cudar   )r0   r  wrapper_coderI   rT  rB  rH   r  r  r   r  r^   r  r   generate_numel_exprr  r%  r  generate_default_gridgenerate_kernel_callr   r^  r  )
rM   r   r  wrapperrd  rb  rE  rk  r  rN   s
             rO   call_kernelzTritonKernel.call_kernel1  s:   ''&&))2249as9~&Aww$$Yq\2(|i7	! ' $$D$**u}}ell&CDzz224>{{c!T%:%:  &{{c!D! % ,,T48$$GG,,22 	% 	
rQ   c                <   t         j                  sy t        j                  j                  }| j
                  j                         \  }}}t        ||      D ]D  \  }}t        |t              sd| d}|j                  |       d| d}|j                  |       F y )Nzassert not z.isnan().any().item()z.isinf().any().item())r   nan_assertsr0   r  r  rI   rT  rO  r   r<   r  )rM   r  rd  rb  	arg_typesr   arg_typer  s           rO   codegen_nan_checkzTritonKernel.codegen_nan_checkO  s    !!''&&"&))":":"<9i I6MC(I.$SE)>?!!$'$SE)>?!!$' 7rQ   c           	        t        | j                  j                        dk(  rEt        | j                  j                        dk(  r#t        | j                  j                        dk(  ry| j                  j                         \  }}}d}|D ]3  }t        j                  j                  |      }|s&t        |j                  j                        dk(  sIt        |j                  j                  D cg c]
  }|dk(  s	| c}      dk(  rt        j                  |j                  j                        }	||	}||	k7  st        d| dd|	 d	| z         }
t        j!                  |
       |D cg c]i  }t        j                  j                  |      rFt        j                  t        j                  j                  |      j                  j                        ndk }}|D cg c]V  }t        j                  j                  |      r3t        j                  j                  |      j                  j                  ndX }}|D cg c]@  }|t        j                  j"                  v rd
n|t        j                  j$                  v rdndB }}t        d| d| d| d| d| dz         }
t        j!                  |
        y t'        d| d      }
t        j!                  |
       yc c}w c c}w c c}w c c}w )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r2   r   Nr   r   zExpected stride order z, but found stride orderrh  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources rQ  z%All the inputs for the triton kernel z have uniform layout)rH   rI   r  r  r  rT  r0   r  rU  layoutr  r   get_stride_orderr[  r.   ru  warninggraph_inputsname_to_bufferr&   )rM   ry  ra  rb  rc  uniform_stride_orderrf  rh  r   stride_ordermsgr   stride_order_list	size_listsource_lists                  rO   warn_mix_layoutzTritonKernel.warn_mix_layout\  s    		''(A-DII,,-2DII--.!3
 (,		(@(@(B%I#!H''$$X.Cs3::??+q03::??=?aa1f?=>!C!223::3D3DE'/+7()\9%01E0FF^_l^<}EFC KK$ %.	) %.D 77--d3 ++AGG,>,>t,D,K,K,R,RS!" %.	 & ) %.	! %.D 77--d3 **4077<<!" %.	  ! %.# %.D	  177#7#77 %  177#9#99 2!	"
 %.   # &(		{,WhVij&ykk]"MNC KK$U "V 3K=@TU
 	CU >)!#s!   0
K&
;K&
/A.K+#AK0AK5c                    t        |i |S r   )r   )rM   rI   r   s      rO   create_cse_varzTritonKernel.create_cse_var  s     $1&11rQ   )r  r}   r  zOptional[Set[str]]r   rt   )r-  zIterable[sympy.Expr]r  List[List[sympy.Expr]])r  r  r  )r  r  r  r  )r  r  r   r}   )rN   r  )r   r}   )r   r}   r  r  r   )
r  r3   r  r}   r  r  r  r   r  rt   )NN)r   r}   r  zOptional[IRNode])9rm   rn   ro   r   	overridesr_  sexprr   DEFAULTr   r/  r)  r;  r+  rM  rQ  r  rr  r  rt  r|  r  r  r  r  r   r  r  r  rK  rL  r  r  r  propertyr  r  r  r  r  r  r  r(  rI  rI  rp  rt  r  r  r   r  r  r  r  r  r  r   r   s   @rO   r  r  )  s   IE )-$,,/3 /3 &	/3b*
Q(,
 @1$@1/E@1 @1D )4J O:4
,$	I `:`:D8$
  $ $L " "0Yv.6$$ $ !	$
 $$ $L.  
^@

*XbH
]~!;F;
''$
<(?B2rQ   r  c                  "   e Zd Zd Zd Zd ZeZeZd Zd Z	e
d        Ze
	 	 	 	 	 	 dd       Ze
d        Zd	 Zd
 Zd Zd Zd Zd Zd Zd Ze
 ej.                  d      d               Ze ej6                  d      fd       Zd ZddZd Zy)TritonSchedulingc                    || _         y r   )r   )rM   r   s     rO   r   zTritonScheduling.__init__  s	    "rQ   c                &    t        d |D              S )Nc              3     K   | ]6  }t         j                  j                  j                  t	        |             8 y wr   rx  r	  s     rO   r  z,TritonScheduling.group_fn.<locals>.<genexpr>  s+     P%QQWW%%..}Q/?@%s   <>)r  )rM   r  s     rO   group_fnzTritonScheduling.group_fn  s    P%PPPrQ   c                :   t        |t        j                        st        |t        j                        r t        j                  j                  ||      S |j                  \  }\  }}|j                  \  }\  t        ||      }|j                         r,|j                         r|k(  xr |k(  }|s |d||       |S |j                         sC|j                         s2|k(  r|k(  s |d||       y|j                         r&t        |j                  t              }|s |d       |S | j                  |j                         ||      }	| j                  |j                         ||      }
| j                  |j                         |j                         z   ||      }t        j                  j                  rVd}t        |	      dkD  r%t        |
      dkD  r|	|
cxk(  xr |k(  nc }n|	|k(  }nt        |
      dkD  r|
|k(  }|s |d|	|
|       yy|j                         s|j                         r|dk(  rdk7  sJ |z  k(  rt!        fd	|j                         D              s	 |d
       yt        j                  j"                  rE|j                         s5| j                  |j                         |      |dfdffv }|s |d       |S y|k7  r |d       |k(  S |j                         r|j                         rJ | j%                  ||      S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz!node1 is not TritonTemplateBufferTr   ztiling mismatch (%s, %s, %s)r2   c              3  j   K   | ]*  }t         j                  f|j                                , y wr   )r  rt  
get_ranges)r
  r  numel2rnumel2s     rO   r  z,TritonScheduling.can_fuse.<locals>.<genexpr>  s1      . !../@!,,.Q.s   03z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fusegroupr!   is_reductionis_templater  r   select_tiling	get_nodesr   r    tiling_prevents_pointwise_fusionrH   r   tiling_prevents_reduction_fusioncan_fuse_horizontal)rM   node1node2rd  numel1rnumel1whyreduction_can_fuseis_triton_templatetiling1tiling2tiling3r  is_reduction_tiling_validr  r  s                 @@rO   r  zTritonScheduling.can_fuse  s*    eYAABj977G
 77@@NN${{FG${{FGu%E$6$6$8!'6!1!Hg6H%G &%!!#E,>,>,@f$G);K   " &0

<P%Q");<)) (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&'<W<&'1\A%"g-D6	 !!!#(:(:(<a<GqL00')) "__.  <= MMBB!--/040B0B)61  !,1- 5:;4412V##!!#E,>,>,@@@''u55rQ   c           
       	
 g t               t               t               fd	fd}t        j                  	
fd       }t              D ]  \  
v rj	                         fd} 	      rj |      r |       5  	 d d d        j	                  j                                j	                  j                                j                          |      r" |       5  j                         d d d        t        d d dj                  d           S # 1 sw Y   xY w# 1 sw Y   xY w)	Nc                b    | j                   \  }\  }}|k(  xr |k(  xs |z  k(  xr |dk(  S r  r  r  rd  
node_numelnode_rnumelr  r7  s       rO   fits_in_main_bodyzBTritonScheduling.generate_node_schedule.<locals>.fits_in_main_body	  sH    +,77(A(
K%'AK6,A efn,A1ArQ   c                N    | j                   \  }\  }}|k(  xr |dk(  xr dk7  S r  r  r  s       rO   fits_outside_reductionzGTritonScheduling.generate_node_schedule.<locals>.fits_outside_reduction	  s4    +,77(A(
K&K;!+;K!KrQ   c               3     K   rdz   d  D ]  } vs |       s| j                   z  r!j                         j                  j                                j                  j                                j	                          rd   t
        u rj                          nj	                  t               d  j	                  t
               j                          j                          y w)Nr2   rs   )		ancestorsr  get_namer  r  r8  r}  DisableReductionrM  )	
other_nodecurrent_loop_writesdoner  r  is_current_reductionsr  node_scheduler  s	    rO   end_current_reduction_loopzKTritonScheduling.generate_node_schedule.<locals>.end_current_reduction_loop#	  s     ""'	"4JD(-j9!4z7K7K!K+//@-11$2C2C2EF%,,T2 #5 r!2o!E!!#$$%56  1%%'!'')s   C>C>C>CC>c                r    dk(  ry| j                   z  sy|rt        |d   t        t        f      rJ dv S )Nr2   Frs   T)r  r   r8  r  )r  r  r  r  r7  s     rO   #requires_closing_previous_reductionzTTritonScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction@	  sN    Q; *T^^; $Z!"%9I'J.   444rQ   zunexpected group: (rW   z) != r2   )
r   rK  rL  r  r  r  r  r  r  r  )rM   r  r  r7  r  r  r  r  r  r  r  r  r  r  s    ```   @@@@@@@rO   generate_node_schedulez'TritonScheduling.generate_node_schedule	  s?   #%(+ #u		L 
	"	"	* 	* 
#	*. %U+KE4t|HHTN5 !&6t]K35 6#''8%))$*;*;*=>$$T*'-/1!((. 21 *)%6(%

1O 5 ,<  65 21s   EEE	E	c                    t        |d       j                  \  }\  }}| j                  |||      }t        j	                  d|       | j                  |||      S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        c                4    t        | j                               S r   r   r  r   s    rO   r  z0TritonScheduling.codegen_nodes.<locals>.<lambda>_	      c!..:J6KrQ   r  zSchedule:
 %s)r   r  r  schedule_logdebugcodegen_node_schedule)rM   r  rd  r  r7  r  s         rO   codegen_nodeszTritonScheduling.codegen_nodes[	  sZ     !,KLRR?E633E5&I+];))-GGrQ   c                "   | j                         sJ t        d t        j                  | j                  j
                  | j                  j                        D              rt        j                  S | j                  j                  j                  S )Nc              3  <   K   | ]  }|j                           y wr   )is_contiguousr
  deps     rO   r  z2TritonScheduling.reduction_hint.<locals>.<genexpr>j	  s!      
W Ws   )r  r  r#  r9  read_writesreadswritesr   r*  r  datar  )r  s    rO   r  zTritonScheduling.reduction_hintg	  sn      """ 
 t'7'7'='=t?O?O?V?VW
 
 !&&&99>>000rQ   c                   t        j                  t         j                        j                  t        j
                  j                  j                  t        j
                  j                  j                  j                  fd |       sy|D cg c]H  }t        |j                         t        j                        s|j                         j                         J }}t        fd|D              syt        j
                  j                  j!                  |        |D ],  }t        j
                  j                  j!                  |       . yc c}w )Nc                    t         j                  j                  j                  | k        ry |       xr  |       k  S )NT)r0   r  r  is_expr_static_and_true)r  has_hintint_maxr  s    rO   within_32bitz=TritonScheduling.can_use_32bit_indexing.<locals>.within_32bitz	  s<     ww77WEA;:9Q<7#::rQ   Fc              3  .   K   | ]  } |        y wr   rp   )r
  r  r  s     rO   r  z:TritonScheduling.can_use_32bit_indexing.<locals>.<genexpr>	  s     <)$<%)s   T)r   r,  r  r   r0   r  r  r  	shape_envr  r   
get_layoutr   MultiOutputLayoutstorage_sizer  r2  )	r  buffersrh  	buf_sizesr  r  r  r  r  s	        @@@@rO   can_use_32bit_indexingz'TritonScheduling.can_use_32bit_indexingr	  s    ++ekk*..GG$$..	77##--66	; E" 
cnn.0D0DE NN))+ 	 
 <)<< 	
""5'2DGG&&tW5 
s   AEc                L   t               }| D ][  }t        |t        j                        s|j	                  |j                                |j	                  |j                                ] dd}|D cg c]
  } ||       }}||z  }t        j                  ||      ryyc c}w )Nc           	     ,   | t         j                  j                  v rt         j                  j                  |    S | t         j                  j                  v rt         j                  j                  |    S | t         j                  j                  v ryt         j                  j                  |    }t        j                  | t        j                  |j                  |j                  gt         j                  j                  |             S t        d|        )Nz$Failed to find buffer matching name )r0   r  r  r  rZ  r   ConstantBufferFixedLayoutr\  r   static_sizes_stridesRuntimeError)r   r  s     rO   _get_bufferz8TritonScheduling.select_index_dtype.<locals>._get_buffer	  s    qww---ww--d33---ww++D11***ww((.((NNTZZ23''2N2Nt2T  !EdVLMMrQ   r  r  )r   r}   r   zUnion[ir.Buffer, ir.TensorBox])	r   r   r   BaseSchedulerNoder   	get_namesused_buffer_namesr  r  )	r  r  reduction_numelbuffer_namesr  r!  r   r  total_numels	            rO   select_index_dtypez#TritonScheduling.select_index_dtype	  s     u!DdI$?$?@ 01 6 6 89 "	N 2>>;t$>
 o-22;H ?s   3B!c                   t        t        d |            }t        |      dkD  rU|D cg c]  }| j                  |       }}|j	                  |d         t        |      k(  r|d   }n!t
        j                  }nt
        j                  }t               }|D ].  }	t        |	d      s|j                  |	j                                0 | j                  |||      }
|||
fS c c}w )Nc                B    | t         t        fvxr | j                         S r   )r8  r  r  r  s    rO   r  z2TritonScheduling.get_kernel_args.<locals>.<lambda>	  s#    !O5E#FF %NN$%rQ   r   get_mutations)r  filterrH   r  r$  r   r  r   hasattrr   r,  r(  )rM   r  r  r%  
reductionsr  hintsreduction_hint_valr  r  r  s              rO   get_kernel_argsz TritonScheduling.get_kernel_args	  s    %

 z?Q5?@ZT((+ZE@{{58$E
2%*1X"%2%:%:"!.!6!6E	!Dt_-  !3!3!56 " --mUOT!9k99 As   Cc                   t         j                  j                  }t        ||      \  }}|r|j	                  |       t
        j                  rvddlm}m	 t        fd|D              sY|D cg c]  }t        ||      r|j                           }}|j	                  |j                   ddj                  |              y y y c c}w )Nr   )r"  r  c              3  6   K   | ]  }t        |        y wr   )r   )r
  r  r  s     rO   r  z3TritonScheduling.codegen_comment.<locals>.<genexpr>	  s      CPa
189=s   z Fused node name list: rW   )r0   r  r  r%   r  r   debug_fusiontorch._inductor.schedulerr"  r  r  r   r  commentr  )	rM   r  r  originsdetailed_originsr"  r  
node_namesr  s	           @rO   codegen_commentz TritonScheduling.codegen_comment	  s    ''&&$7w$O!!g&
  CP  +*!!%67 JJL*  
 !!''>tyy?T>UV s   .#Cc                   | j                  |||      }| j                  |||      \  }}}t        ||||d}| j                  ||       t	        j
                  |      5  |j                         }	|D ]!  }
|
t        t        fvs|
j                          # 	 d d d        | j                  	|      }t        j                  d|       | j                  |       |j                  |       |j                          t        j                   xj"                  |j"                  z  c_        t        j                   xj$                  |j$                  z  c_        t&        j(                  r|j)                  |       t        j                   j*                  j,                  rt&        j.                  r|j0                  j3                         }|D ]  }
t5        |
t6        j8                        s|
j;                         }||vr3|
j<                  j?                         }|Pt@        d   dxx   dz  cc<   t        j                   j*                  jC                  d|jD                  d| d        | j6                  jG                          y # 1 sw Y   xY w)	Nr  r  r  z+Generating kernel code with kernel_name: %sinductorintermediate_hooksr2   zrun_intermediate_hooks(rW   rG   )$r  r2  r  !codegen_node_schedule_with_kernelr0   set_kernel_handlerr  r8  r  mark_rundefine_kernelru  r  r;  r  r  r  r  inplaced_to_remover   r  r  supports_intermediate_hooksgenerate_intermediate_hooksrI   live_output_buffersr   r   r"  r  r  get_origin_noder   r  r   free_buffers)rM   r  r  r%  tiled_groupsr1  r  r  r   src_coder  ry  	live_outsr   origin_nodes                  rO   r  z&TritonScheduling.codegen_node_schedule	  s   ))-P595I5I5/6
2I{ -#	
 	..}fE!!&),,.H%1ABBMMO & * ((=A		?M]+;'  "	6#9#99	""f&?&??"!!"";/ GG  <<22 779I%!$	(C(CD}}y("ii779*Z()=>!C>GG((221+2B2B1ERvQO & 	##%M *)s   $I?I??J	c           	     B   d }|5  t        j                         }|j                   ||             |D ]!  }|t        t        fvs|j                          # t        |      D ]  \  }}|t        u r |j                  |j                                .|t        u r+|j                          |j                   |||d               at        |j                         |j                  |j                               }|j                  |        	 d d d        y # 1 sw Y   y xY w)Nc                0    t        j                  d |       S )Nc                    | t         uS r   )r  r+  s    rO   r  zeTritonScheduling.codegen_node_schedule_with_kernel.<locals>.current_reduction_nodes.<locals>.<lambda>*
  s
    :J1JrQ   )r#  	takewhile)r  s    rO   current_reduction_nodeszSTritonScheduling.codegen_node_schedule_with_kernel.<locals>.current_reduction_nodes)
  s    &&'JERRrQ   )rK  	ExitStackr;  r8  r  decide_inplace_updater  enter_contextrM  closer   _bodyr|  r  r  )rM   r  r   rR  stackr  rE  r  s           rO   r@  z2TritonScheduling.codegen_node_schedule_with_kernel(
  s    	S ((*E!!"9-"HI%1ABB..0 & %]34++''(@(@(BC_,KKM))*A-PQPRBS*TU 6djjA!'!<!<T__=N!OJLL, 4 VVs   ?DCDDc                   t         j                  j                  }||j                  v r|j                  |   }|S t        j
                  j                  r$t        |t        j
                  j                        nd}t        |      d d }dj                  d|||j                         g      }||j                  |<   t        j
                  j                  r|nd}|j                  t        t        j                        |      }|j                  t        t        j                         |      }|j                  dd      }t#        t%        |j'                               d      \  }}	}
t)               }|j+                  d	|d
       |j-                  |d       |j+                  d       d|
 }t/        ||      \  }}|d|z   dz   |z   z  }|j1                  ||j3                         |       |S )Nr  r   rd  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''T)stripz''')z# kernel path: rQ  )r0   r  r  src_to_kernelr   r   descriptive_namesr$   r1   r  next_kernel_suffixunique_kernel_namesreplacer}   r)   r  r`  r   r   r]  r6   r  r  r%   rC  r  )rM   rK  r  r  ry  
fused_namekernel_category	subs_namebasenamerd  kernel_pathcompile_wrappermetadata_commentr8  r9  s                  rO   rC  zTritonScheduling.define_kernel?
  s   ''&&w,,,!//9KL E ==22 &mV]]5T5TU 
 AJ2ANO((?J8R8R8TUK /:G!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H'/	(..:J0KT'R$Ha,.O%%(=i]%&PQ""84"8%%f-!0>(;M7(S%G%w 58H HH!!_5579I rQ   c                   |j                   \  }\  }}|dk(  sJ |j                  j                  |j                        \  }}|5  |g|D ]  }|j                            |       }	|D ]0  }|j	                  |j                  |j                                      2 	 ddd       t        j                  |      5  t        	t              r|	n|	j                         }
|g|}t        j                  r4|j                          d|
 d|j                         j!                          }
| j#                  |
|      }ddd       | j%                         |j'                  |j                         t        j(                  xj*                  |j*                  z  c_        t        j(                  xj,                  |j,                  z  c_        | j.                  j1                          y# 1 sw Y   GxY w# 1 sw Y   xY w)z+
        Codegen a triton template
        r2   NrQ  )r  r  make_kernel_renderrB  r  r|  r  r0   rA  r   r}   finalizer   r~  rt  rp  r  rC  r;  r  r  r  rD  r   rI  )rM   template_nodeepilogue_nodesrd  r  r7  r   renderr  partial_coderK  r  ry  s                rO   codegen_templatez!TritonScheduling.codegen_templatej
  s    +00?E6{{&++>>}?Q?QR&88 9!8L&V889JKL '	  !!&) lC0 !**, 
 +<^<M&&$AACDBxjPRSYSrSrStS}S}S  SA  B,,X}EK * 	]+;(:(:;	6#9#99	""f&?&??"##%3 V *)s   AG!9A>G.!G+.G7c                V    t         j                  j                  j                  d       y )Nztorch.cuda.synchronize())r0   r  r  r  r  s    rO   codegen_synczTritonScheduling.codegen_sync
  s    	&&'ABrQ   c           	     l   ddl m} |j                  |j                         |       D ]d  } |       }|D ]  \  }}}}| j	                  |||      }	| j                  |	||      \  }
}} |j                  ||
||d}| j                  |	|       t        j                  |      5  |	D ]!  }|t        t        fvs|j                          # 	 d d d        t        j                  xj                  |j                  z  c_        t        j                  xj                  |j                  z  c_         |j!                         }| j#                  ||g      }| j%                  |g       |j'                  t        j                  j(                  |       g | j*                  j-                          y # 1 sw Y   xY w)Nr2   )ForeachKernelr=  )triton_foreachru  horizontal_partitionget_subkernel_nodesr  r2  create_sub_kernelr@  r0   rA  r8  r  rB  r  r  rD  r  rC  r;  r  r  r   rI  )rM   foreach_noderu  partitions_with_metadatar   r  rJ  r  r7  r  r1  r  r  	subkernelr  rK  ry  s                    rO   codegen_foreachz TritonScheduling.codegen_foreach
  s   1(5(J(J,,.)
$ #_F6N2|UF $ ; ;E5& Q
 ((vF	& 5F44!#5' +		 66!
 )))4 -9I'JJ MMO !. 5 ''9+D+DD'**i.J.JJ*3 7O6 ,,.H,,X~FK  ,0qww33[AE)
H 	##% 54s   !F*6F**F3r   c           
     b   | j                         \  }}t        |      dk  ry| j                         }t        |j                        t        |      k(  sJ |j                  |j
                  g}t        d t        j                  | D              sJ t        j                  | D cg c]:  }|j                  t        j                  j                  vrt        |t              r|< }}|j
                  D ch c]  }|j                   }}g }|D ]  }t        j                  j                  j!                  |j"                  |j                        }	t        |	      t        |      k(  sJ 	 |	j#                  d      dz   }
|
t        |      k(  rt        d |	|
d  D              r	 t        j                  j                  j'                  t)        |d |
             t        j                  j                  j'                  t)        ||
d              f}t        j                  j                  j+                  t)        d t-        ||	      D                    }|j                  |v r|dz  }t.        j1                  |d         r|dz  }t.        j1                  |d         r|dz  }t        j                  j                  j+                  |t)        t        j                  ||            z
        dk\  s|j3                  t/        |||j                                |S c c}w c c}w # t$        $ r Y w xY w)Nr2   rp   c              3  H   K   | ]  }t        |t        t        f        y wr   )r   r   r   r  s     rO   r  z5TritonScheduling.candidate_tilings.<locals>.<genexpr>
  s$      
4 sY014s    "c              3  &   K   | ]	  }|d k(    ywrw  rp   r	  s     rO   r  z5TritonScheduling.candidate_tilings.<locals>.<genexpr>
  s     7!qAvr  c              3  2   K   | ]  \  }}|d k7  s|  ywrw  rp   )r
  r  r[  s      rO   r  z5TritonScheduling.candidate_tilings.<locals>.<genexpr>
  s      -A\T6Vq[D-As   r   r   )r  rH   pointwise_read_writes
range_varsr
  r  r  r#  r9  r   r0   r  r  r   r   r  stride_hintsr  
ValueErrorr!  r*   r  rO  CandidateTilingis_good_sizer  )r  rP  reduction_rangesrwdep_sourcesr  depswrite_namestilingsr  r~   rJ  scores                rO   candidate_tilingsz"TritonScheduling.candidate_tilings
  s    $(??#4  v;!'')2==!S[000 xx+ 
 4
 
 	
 
 !4
4xxqww666:c9;U 4 	 

 ,.9959Csxx95)+Cgg&&33CIIr}}MGw<3v;...
a(1,CK'7wuv77  8   ))-v*GH  ))-uv*GHL
 GG$$.. -0-A E
 xx;&
++LO<
++LO<
   **M)//&BR*STT 
 |UCHHMNM N a

 6   s$   &?L5L*"L!L!!	L.-L.r2   c                   |dk7  st         j                  j                  dk  rvt        j                  t
        j                  k  rQt        j                  |      D ]9  }t        | j                  |            dkD  s!t        j                  d        ||fS  ||fS t               }t        j                         }t        j                  |      D ]c  }| j                  |      D ]M  }|j                  |v r|j!                  |j                         ||j"                  xx   |j$                  z  cc<   O e |j'                         D cg c]  \  }}|	 }	}}t         j                  j                  dk\  rt)        dt        |	            D ]  }
|	d   \  }}|	|
   \  }}t*        j,                  j.                  j1                  ||z
        dk(  rCt*        j,                  j.                  j1                  ||z
        dk  r|	|
   \  }}|	d   \  }}t*        j,                  j.                  j1                  ||z
        dkD  sJ t*        j,                  j.                  j3                  ||      s|t5        ||      |f}|g|	z   }	 n t        |	      dkD  rt        j                  d|	       |	D ]!  }g ||t7        fd|D              sc S  ||fS c c}}w )z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r2   r   z"reduction over non-contiguous dimsr   zpossibly bad tiling: %sc              3     K   | ]B  }t        |t        j                        r&t        j	                  |j                                D y wr   )r   r   SchedulerNoder  rt  r  )r
  r  
new_groupss     rO   r  z1TritonScheduling.select_tiling.<locals>.<genexpr>8  s=      )DdI$;$;< **:t7HI)s   AA)r   r   	max_tilesperf_hint_loglevelloggingWARNINGr8  r-  rH   r  infor   collectionsr   r   r  tilingr  most_commonrB  r0   r  r  r  rT  r   r  )r   r  r  r%  r  
seen_namescandidate_tilesr  r  ranked_tilingsrE  a0a1b0b1rJ  r  s                   @rO   r  zTritonScheduling.select_tiling
  s    a6==#:#:a#? ""goo5+22=AD30067!;%**+OP?++	 B ?++U
(3(;(;(=#**=9D//5;;*,v{{+.&,,>.	 6 : 7F6Q6Q6ST6S]VU&6ST==""a' 1c.12'*B'*B77##--b2g6!;77##--b2g6:+A.FB+A.FBww''11"r':Q>>>77##@@RH (2r"2B7F&,X%>N 3 ~"8.I*L9<99J ) 
 "! + ''K Us   Kc                     y r   rp   r  s    rO   flushzTritonScheduling.flushA  s    rQ   c                     y)NFrp   r  s    rO   ready_to_flushzTritonScheduling.ready_to_flushD  s    rQ   c                   t        |d       j                  \  }\  }}| j                  |||      }| j                  |||      }| j	                  |||      \  }}}	t        ||||	d}
|D ]  }t               |_         | j                  ||
       t        j                  dd      5  t        j                  |
      5  |
j                         }d d d        d d d        j                  t        t         j"                        d      }t%        j&                  |      fdfd}fd	}t(        j+                  d
|D ch c]  }|j-                          c}j.                          |       j.                  fS j1                         j2                  j4                    j6                   d          j8                  }t;        |      dk(  sJ |d   j<                  dkD  rt?        d      ntA        fd      t(        j+                  d|D ch c]  }|j-                          c}        |        j.                  fS # 1 sw Y   xY w# 1 sw Y   xY wc c}w c c}w )Nc                4    t        | j                               S r   r  r   s    rO   r  z8TritonScheduling.benchmark_fused_nodes.<locals>.<lambda>H  r  rQ   r  r=  r~  TrZ  c                 ~     j                   J t        j                  j                   j                         d   dz   S )Nr   z.kernel_perf)__file__ospathsplitext)mods   rO   cache_file_pathz?TritonScheduling.benchmark_fused_nodes.<locals>.cache_file_patha  s6    <<+++77##CLL1!4~EErQ   c                             } t         j                  j                  |       r.t        |       5 }t	        |j                               cd d d        S y # 1 sw Y   y xY wr   )r  r  existsopenr   read)r  fdr  s     rO   
load_cachez:TritonScheduling.benchmark_fused_nodes.<locals>.load_cachee  sE    "$Dww~~d#$Z2 +  Z  s   AA c                             } t        | d      5 }|j                  t                     d d d        y # 1 sw Y   y xY w)Nw)r  writer}   )r  r  r  mss     rO   store_cachez;TritonScheduling.benchmark_fused_nodes.<locals>.store_cachel  s0    "$DdCBR! !s	   9Az%kernel src code for %s written to: %sr   r2   r   c                 4      j                     d         S rc  )
clone_args)rI   callwrapped_jit_functions   rO   r  z8TritonScheduling.benchmark_fused_nodes.<locals>.<lambda>  s    $'F';'F'F'Ma'P"QrQ   z+The fused kernel for %s took %.3f ms to run)!r   r  r  r  r2  r  r   r(  r@  r   patchr0   rA  r  rb  r}   r)   r`  r   r  ru  r  r  r  get_argsr  rZ  r  	launchersrH   n_spillsr   r#   )rM   r  rd  r  r7  r  rJ  r1  r  r  r   r  rK  r  r  r  rI   r  r  r  r  r  s                   @@@@@@rO   benchmark_fused_nodesz&TritonScheduling.benchmark_fused_nodesG  s@    ,KLRR?E633E5&I))-G595I5I5&6
2I{ -#	
 A5AL  	..}fE\\,d3Q5I5I&5Q,,.H 6R3 ##C(?(?$@)Lx(	F		"
 			3#()5aQZZ\5)LL	

 \>s||##||~xx"{{ 	,!,,d3A67(22	9~"""Q<  1$uB QRB		9#()5aQZZ\5)	

 	3<<m 6R5Q330 *4 *s0   -I%II%I2
$I7
I"	I%%I/N)r  r  r  z(Iterable[Union[ir.Buffer, ir.TensorBox]]r   rt   r  ) rm   rn   ro   r   r  r  can_fuse_verticalr  r  r  r  r  r  r(  r2  r;  r  r@  rC  rq  rs  r}  r  r  r  r  r^   r  r  r  r  r  rp   rQ   rO   r  r    s   #Qf6P !"FP
H 1 1 ""$L"	" "H # #J:445&n-.)V &DC'&R YA  AF ANqAQ A( A(FJ rQ   r  c                  @    e Zd ZU ded<   ded<   dZded<   ed        Zy)	r  zTuple[sympy.Expr, sympy.Expr]r  r   r  NzOptional[str]r   c                r    t         j                  j                  j                  |       } | dk\  xr | dz  dk(  S )z@Somewhat arbitrary heuristic used to boost scores for some sizesr   r   rg  r  s    rO   r  zCandidateTiling.is_good_size  s5     GG&&q)Bw(AFaK(rQ   )rm   rn   ro   __annotations__r   r  r  rp   rQ   rO   r  r    s)    ))JD-) )rQ   r  c                      e Zd ZdZy)r  z
    Marker to invoke `kernel.disable_reduction()`.  This closes a
    reduction loop and allows for pointwise ops to occur on the output
    of a reduction.
    N)rm   rn   ro   r  rp   rQ   rO   r  r    s    rQ   r  c                       e Zd ZdZed        Zy)r8  z1
    Marker to end a DisableReduction block.
    c              #  Z   K   d}| D ]   }|t         t        fv r	|t        u }|r| " yw)zf
        Get the nodes from node_schedule skipping those in a
        DisableReduction block.
        FN)r8  r  )r  disabledr  s      rO   r-  zEnableReduction.filter  s;      !D)9::#33
 "s   )+N)rm   rn   ro   r  r  r-  rp   rQ   rO   r8  r8    s      rQ   r8  c                      e Zd Zy)rU  N)rm   rn   ro   rp   rQ   rO   rU  rU    s    rQ   rU  )s
__future__r   r  rK  dataclassesr  r#  r  r   ri  r  rr  typingr   r   r   r   r   r	   r
   r   r   r^   r   torch._loggingtorch._prims_commonr   torch.utils._sympy.functionsr   r   torch.utils._sympy.value_rangesr   _dynamo.utilsr   r  r   r   r   	codecacher   r   r   dependenciesr   r   r   r   r   optimize_indexingr   r    r!   triton_heuristicsr"   utilsr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   virtualizedr/   r0   wrapper_benchmarkr1   commonr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   triton_utilsr=   r>   r?   	getLoggerrm   ru  _logginggetArtifactLoggerr  r   
fusion_logrD   rX   r  r_  r   r   r   r   r   	dataclassr  r  r  r  r  r  r  r8  	ExceptionrU  rp   rQ   rO   <module>r     s   "         	  R R R    0 B 7 % $ $ 8 8 - < < A 1 ,    ! B   E Dg!00<H~~//*E^^--hA
$6M $6N 	$&> >.P$k P$f &K &K &KRC</ C<L>'? >'Bu26 u2p+p ~ p f 	) 	) 	)  ,		 	rQ   