
    Ph|                     j   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z	d dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ d dlZd dlmc mZ d dlmZ d dlm Z  d dl!m"Z"m#Z# ddl$m%Z% dd	l&m'Z'm(Z( dd
l)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5  ejl                  e7      Z8 e#       rd dl9Z9d dl9m:Z: d dl;m<Z< d dl=m>Z> ne?Z:dZ9e?Z>e?Z< e"       rd dl=m@Z@ ndZ@dZA G d de      ZB G d de      ZCdeeC   deDdee:   fdZEd ZF G d de>      ZGd ZHg ZIee   eJd<   d ZKd  ZL G d! d"eG      ZMd#ee:   fd$ZNd%eOd&eOd#ee:   fd'ZP	 	 d7d(eeeD      d#ee:   fd)ZQd#ee:   fd*ZRdddd+d,ZS	 	 	 	 	 d8de:fd-ZTd9de:fd.ZUd:d/ZV	 	 	 	 d;d0ZW	 	 	 	 d<d1ZX	 	 	 	 d<d2ZYd7d3ZZd7d4Z[d7d5Z\d6 Z]y)=    N)autoEnum)AnyCallableDictListOptionalSetTuple)get_interface_for_device)dynamo_timed)
has_tritonhas_triton_package   )config)	cache_dirCudaKernelParamCache)CoordescTuner)ReductionHintTileHint)ceildivconditional_productcreate_bandwidth_info_strdo_benchget_num_bytesnext_power_of_2triton_config_to_hashable)Config)OutOfResources)KernelInterface)get_cuda_stream    c                   R    e Zd Z e       Z e       Z e       Z e       Z e       Zy)HeuristicTypeN)	__name__
__module____qualname__r   	POINTWISE	REDUCTIONPERSISTENT_REDUCTIONTEMPLATEUSER_AUTOTUNE     lC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/triton_heuristics.pyr$   r$   >   s&    II6vHFMr.   r$   c                   (    e Zd ZdZej
                  Zy)AutotuneHintr   N)r%   r&   r'   ELEMENTS_PER_WARP_32r   __str____repr__r-   r.   r/   r1   r1   F   s     ||Hr.   r1   hints
block_sizereturnc           	      B   g }| D ]  }|t         j                  k(  st        |      dk(  r
|dz  ddff}nDt        |      dk(  r|dz  ddfd|dz  dff}n%t        |      dk(  r|dz  ddfd|dz  dfdd|dz  ff}D ]   }|j                  t	        |g|ddi       "  |S )a  
    AutotuneHints can be attached to the metadata of triton kernels for providing
    suggestions about what to try for autotuning. One reason to do this is if there are
    some configs that are only useful in specific scenarios, in which case we can avoid
    wasting compile time on autotuning unless we know we are in one of those scenarios.

    Based on those hints, this function will generate a list of additional autotuning
    configs to try.
    r      N      num_elements_per_warpr"   )r1   r2   lenappendtriton_config)r5   
size_hintsr6   configshintxyz_optionsxyzs          r/   autotune_hints_to_configsrE   P   s     G<444:!# *at<>ZA% *aD9AzQPT;UVZA%1_a+
a+:?+
 #!" /1 # * Nr.   c                  b    t        j                         ryt        j                  j                   S )NT)torch$are_deterministic_algorithms_enabledr   tritonautotune_pointwiser-   r.   r/   disable_pointwise_autotuningrK   w   s&     113}}////r.   c                        e Zd ZdZ	 	 d fd	ZddZdedee   fdZ	d Z
deee   eeef   f   fd	Zed
        Zd Zd Zd Zd Z xZS )CachingAutotunera	  
    Simplified version of Triton autotuner that has no invalidation
    key and caches the best config to disk to improve cold start times.
    Unlike the main triton Autotuner, this version can precompile all
    configs, and does not rely on the Triton JIT.
    c	           
         t         
|           || _        || _        |i n|| _        || _        || _        || _        || _        t        j                  t        j                        rOt        j                  dt        | j                               | j                  D ]  }	t        j                  |	        g | _        t!        j"                         | _        t'        j(                  d      ]t&        j*                  j-                  t/               dt1        | j                  j3                  dd                  t&        j4                  d<   || _        t9        d| j                  j:                  |      | _        t>        j@                  jB                  jE                  | j                  j3                  dd	            | _#        y )
Nz CachingAutotuner gets %d configsTRITON_CACHE_DIRrI   devicer   F)is_mmnamer@   kernel_nameztriton kernel)$super__init__fntriton_metainductor_metasave_cache_hookmutated_arg_namesrA   heuristic_typelogisEnabledForloggingDEBUGdebugr=   	launchers	threadingLocklockosgetenvpathjoinr   strgetenvironr@   r   r%   coordesc_tunerrG   _C	_profiler_RecordFunctionFastrecord_function_ctx)selfrV   rW   rA   rY   rZ   r[   r@   rX   c	__class__s             r/   rU   zCachingAutotuner.__init__   sR    	&#0#8Rm.!2,GMM*II8#dll:KL\\		! " NN$	99'(0-/WW\\D$$((156.BJJ)* %+dgg..:

 $)88#5#5#I#I""=/B$
 r.   c                 :   | j                   5  | j                  r
	 d d d        y g | _        g }| j                  D ]D  }	 | j                  ||      \  }}| j                  j                  |       |j                  |       F t        | j                        dk(  rt        d      t        | j                        }t        d      }|j                  j                  | j                  d         }t        j                  ru| j                  t         j"                  k(  rW| j$                  J|j&                  dk(  r:t)        | j                  |      D ]   \  }	}t        | j$                        dk(  sJ |	j*                  d   }
|	j*                  d   }| j$                  d   |
z   d	z
  |
z  }t-        |d
d       }|g|dk  rm|d|j.                  z  k  r|dz  }||	j0                  z  }t3        d|z  d	      }|||j4                  z  k  rt7        j8                  |	      }|dz  |j*                  d<   ||v r|j;                  |       | j                  j                  | j                  ||      d	          # d | _        d d d        y # t        $ r Y Tw xY w# 1 sw Y   y xY w)Nr   z9No valid triton configs. Report a fatal compilation errorcudarP      r:   XBLOCKRBLOCKr   n_regs@   i   r"   )rd   ra   rA   _precompile_configr   r>   r=   RuntimeErrorsetr   Workerget_device_propertiesrW   r   dynamic_scale_rblockr[   r$   r)   r@   majorzipkwargsgetattrmax_threads_per_multi_processor	num_warpsmaxmulti_processor_countcopydeepcopyadd)rq   warm_cache_only_with_cccompiled_binariesrr   compiled_binarylauncherseen_configsdevice_interfacedevice_propr?   xblockrblocktotal_blocknregnreg_per_warpnreg_per_blockmax_blocks_per_sm
new_configs                     r/   
precompilezCachingAutotuner.precompile   s   YY~~ Y  DN "\\040G0G21-OX %%h/!((9 " 4>>"a'"O  t||,L7?*11GG  *K ++''=+B+BBOO/  %%*69LL"372M? t/1444*11(;F*11(;F#'??1#5#>#Bv"MK"?HdCD|  |  u(S(SSS $(2IM%2]5L5L%LN ),E^,CQ(G% $,{/P/PPQ !!%}!=J28A+J%%h/!\1  $$Z0NN))//
<STUVWm7r  DLy Y &  Ys4   JJJH&J	J
JJJJcfgr   c           
         t        j                  | j                        }|j                  j	                         D ]0  \  }}||d   | j
                  j                  j                  |      <   2 |j                  |d<   |j                  |d<   t        j                  xr t        j                  j                  du |d<   t        j                  j                  dnd|d<   |r%t        j                   | j
                  fd	|d
|dfS t        j"                  j%                  |d         5  t        j"                  j'                  t        j"                  j)                                t        j                   | j
                  fi |}|j+                          ddd       t-        | j
                  j                        D cg c]  \  }}|| j
                  j.                  vr|! }	}}| j
                  j                  D 
cg c]  }
|
|j                  vs|
 }}
|j                  t        t        j"                  j0                  t        j"                  j(                  d}t3        ddj5                  |       ddj5                  |	       ddj5                  |	       dj7                         |       |d   }||_
        t9        |dd      |_        t9        |dd      |_        t9        |dd      |_        t        j                  j@                  |_         |j@                  r| j
                  |_        ||_!        ||fS # 1 sw Y   xY wc c}}w c c}
w )z/Ahead of time compile a given autotuner config.	constantsr   
num_stagesNr`   ru   hipdevice_typeT)warm_cache_onlyccrP   )	grid_metabinrG   
set_devicecurrent_devicez
            def launcher(z, a  , grid, stream):
                if callable(grid):
                    grid_0, grid_1, grid_2 = grid(grid_meta)
                else:
                    grid_0, grid_1, grid_2 = grid

                if hasattr(bin, "num_ctas"):
                    bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps,
                                bin.num_ctas, *bin.clusterDims, bin.shared,
                                stream, bin.cu_function, None, None, None,
                                z)
                else:
                    bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared,
                                stream, bin.cu_function, None, None, None,
                                z))
                return bin
            r   ry   n_spillsshared)"r   r   rW   r   itemsrV   	arg_namesindexr   r   r   assert_indirect_indexingrG   versionr   rI   compileru   rP   synchronizer   _init_handles	enumerate
constexprsr   execrh   lstripr   ry   r   r   store_cubinr   )rq   r   r   compile_metakvbinaryiarg	call_argsrR   def_argsscoper   s                 r/   r{   z#CachingAutotuner._precompile_config  s   }}T%5%56JJ$$&DAqDEL%dgg&7&7&=&=a&@A '$'MM[!%(^^\"++I0A0AT0I 	W
 160A0A0Ifu]#"GG$(. #	   ZZ|H56JJ""5::#<#<#>?^^F   " 7 $DGG$5$56
63*** 6 	 

 &*WW%6%6Q%6T$cjj:PD%6Q **//#jj77
 	))H-. 
/! "&9!5 6 7! "&9!5 6 7" '	
, $!&(D9#FJ=!&(D9%}}88''HK!HLxq 76

 Rs   &A,L5<$M;MM5L?c                F    j                   t        j                  j                  kD  r6t        j                  dj                  j                          t        d      S t        t        j                  j                                fd}t        |dd      S )z+Measure the performance of a given launcherz/Skip config %s because of register spilling: %dinfc            	         j                   j                  Pj                   j                  i t        t        j                              j                   j
                          j                  i \  } } | i |d y )Ngridstream)r   pre_hookdictr   r   r   
clone_args)cloned_argscloned_kwargsargsr   r   r   rq   r   s     r/   kernel_callz+CachingAutotuner.bench.<locals>.kernel_calls  s    ''3((QtC56Q(//:P:PQ *9$)I&)I&K 	r.   (   T)rep
fast_flush)r   r   rI   spill_thresholdr\   r`   floatr!   rG   ru   r   r   )rq   r   r   r   r   r   r   s   ````` @r/   benchzCachingAutotuner.benchg  sx    v}}<<<IIA!!
 < !:!:!<=	 	 ==r.   r7   c                    ddl m} g }t        |      D ]o  \  }}| j                  j                  |   | j
                  v r4t        |t        j                        sJ |j                   ||             _|j                  |       q i }|j                         D ]@  \  }}|| j
                  v r(t        |t        j                        sJ  ||      ||<   <|||<   B ||fS )Nr   )clone_preserve_strides)
compile_fxr   r   rV   r   rZ   
isinstancerG   Tensorr>   r   )	rq   r   r   r   r   r   r   r   rR   s	            r/   r   zCachingAutotuner.clone_args  s    6
 oFAsww  #t'='==!#u||444""#9##>?""3' & )+ID#t---!#u||444&<S&Ad#&)d# ( M))r.   c           	         | j                   D ci c]  }| | j                  |g|i | }}|j                         D ]+  \  }}| j                  j	                  |j
                  |       - t        j                  t        j                        rot        j                  d       |j                         D ]G  \  }}t        j                  d|j
                  ||j                  |j                  |j                         I |S c c}w )Nz Benchmark all input configs get:z*%s: %f, nreg %d, nspill %d, #shared-mem %d)ra   r   r   rl   cache_benchmark_resultr   r\   r]   r^   r_   r`   ry   r   r   )rq   r   r   r   timingsr   r   s          r/   benchmark_all_configsz&CachingAutotuner.benchmark_all_configs  s     !NN
* jdjj;D;F;;* 	 

 MMODAq66qxxC $ GMM*II891		@HHHHJJHH ( )
s   Dc                      | j                   |i |}t        j                  ||j                        g| _        | j
                  r)| j                  | j                  d   j                         yy)zDo the actual autotuningkeyr   N)r   builtinsminrj   ra   rY   r   )rq   r   r   r   s       r/   autotune_to_one_configz'CachingAutotuner.autotune_to_one_config  s]    ,$,,d=f=",,wGKK@A  !2!9!9:  r.   c                    t        |      r! ||j                  j                        \  }}}n|\  }}}| j                  j	                  dd       }|J d       |j
                  j                  d   ||||j                  j                  j	                  dd      |j                  j                  j	                  dd       |j                  j                  j	                  dd       |j
                  j                  |j
                  j                  ||j                  j                  d}t        j                  j                  /t        j                  |||j
                  j                  d	          y d
d l}	|	j!                  |j
                  j                  d         j#                         |j
                  j                  d<   t        j                  |||j
                  j                  d          y )NrS   zkernel_name can not be NonerR   rw   r   YBLOCKZBLOCK)mangled_namegrid_xgrid_ygrid_zx_blocky_blockz_blockr   
shared_memr   metacubinr   
hsaco_pathhsaco)callabler   r   rX   rj   r   metadatar   r   rG   r   r   r   r}   asmpathlibPath
read_bytes)
rq   r   r   r   r   r   r   r   paramsr   s
             r/   save_cuda_kernelz!CachingAutotuner.save_cuda_kernel  s   D>%)(//*@*@%A"FFF%)"FFF  $$]D9= ==$LL11&9--11(A>--11(DA--11(DA!//",,--OO**
 ==$ $$S&(,,2B2B72KL
 (/  .)jl LLW% !$$S&(,,2B2B72KLr.   c                      j                   t        j                  k(  s j                   t        j                  k(  r|S   j                  | \  }|j
                  |i fd} j                   t        j                  k(  rd|j
                  j                  v rJ d        j                  j                  ||j
                  d      }d|_
         j                  r j                  |d       j                  |      S )a%  
        Coordinate descent tuning can be run with or without max-autotune.

        The only difference between these two is the starting config for coordinate_descent tuning.
        E.g., assuming regular autotune only get one config C1; while max-autotune get 4 configs C1, C2, C3, C4
        and max-autotune figure out C3 is the best.

        Then if coordinate descnt tuning is run with max-autotune disabled, it will start from C1;
        while if coordinate descent tuning is run with max-autotune enabled, it will start from C3.
        c                 .   j                   5  j                  | d       \  }}d d d        | <    j                  |gi }t        j	                  d|j
                  ||j                  |j                  |j                         |S # 1 sw Y   hxY w)Nz4COORDESC: %s: %f, nreg %d, nspill %d, #shared-mem %d)	rd   r{   r   r\   r`   r   ry   r   r   )r   _r   outr   config2launcherr   rq   s       r/   benchmark_one_configzHCachingAutotuner.coordinate_descent_tuning.<locals>.benchmark_one_config  s    "55fdC8 &.OF#$**X>>v>CIIF!! J s   BBrx   zpCoordinate descent tuner relies on the assumption that persistent reduction's triton config does not have RBLOCKNT)found_by_coordesc)r[   r$   r+   r,   r   r   r*   r   rl   autotuner   rY   rj   )	rq   r   r   r   r   r   best_configr   r   s	   `  `   @@r/   coordinate_descent_tuningz*CachingAutotuner.coordinate_descent_tuning  s     =#9#99""m&A&AA O($/Q#??H5	" =#E#EEHOO222	~ ~	~ 
 ))22 (//4
 )-%   E"";//r.   c          	      Z   t        | j                        dk7  rTt        | j                        dk(  r| j                          t        | j                        dkD  r | j                  |d|i| t	        | j                  d   j
                  dd      s;t
        j                  r+ | j                  | j                  d   g|d|i|g| _        | j                  \  }|j                  r| j                  |||       |j
                  j                  R|j
                  j                  i t        t        | j                  |            |j
                  j                  |       t        j                  r#| j                   5   ||i |||dcd d d        S  ||i |||dS # 1 sw Y   y xY w)Nr   r   r   r   Fr   )r=   ra   r   r   r   r   r  r   r   r   r   r   r   r   autograd_profiler_is_profiler_enabledrp   )rq   r   r   r   r   r   s         r/   runzCachingAutotuner.run  s   t~~!#4>>"a'!4>>"Q&+++TGGG q)002EuM00 /..NN1%(,37;ADN nn!!$9??##/OO$$W4DNND12Whoo6L6LWPVW 11)) !	 *)  	  *)s   =F!!F*NNN)r%   r&   r'   __doc__rU   r   r   r	   intr{   r   r   r   r   r   ri   r   r   r   r   r   r  r  __classcell__rs   s   @r/   rM   rM      s     +
Z] ~R f R xPS} R h>8*U49d38n3L-M *0  .;"MH00d-r.   rM   c                     dd l }dd l} |j                         }||j                   |j                  }|g }|j                  |       D ]A  }t        |t              s|j                         D ]  \  }}|| u s|j                  |        C |S Nr   )
gcinspectcurrentframef_localsf_backget_referrersr   r   r   r>   )objr  r  frame	obj_namesreferrerr   r   s           r/   _find_namesr  @  s     G  "E

 
 I$$S)h% (18$$Q' ) *
 r.   collected_callsc                  ,    t         j                          y r  )r  clearr-   r.   r/   start_graphr  T  s    r.   c            
      .   t        t              dk(  ry t        d t        D              } t        d t        D              }t        j                         d   j
                  }d| d| dd|dd	|| d
z  z  dd	}t        |       t                t        j                  }|t        t        d d      }	 t        |d      5 }t        j                  d|       |j                  d       |j                  d| d       |D ]>  \  }}}	}
|| z  dz  dd}d| d|
 }t        |||	|      }|j                  |dz          @ |j                  | d       d d d        y y # 1 sw Y   y xY w# t        $ r!}t        j!                  d||       Y d }~y d }~ww xY w)Nr   c              3   &   K   | ]	  }|d      yw)r   Nr-   .0calls     r/   	<genexpr>zend_graph.<locals>.<genexpr>[  s     ;?4tAw?   c              3   &   K   | ]	  }|d      ywr   Nr-   r   s     r/   r#  zend_graph.<locals>.<genexpr>\  s     9T!Wr$  r   z	SUMMARY (z)
z.2fzms   	 z GB	      @@zGB/sc                     t        | d         S r  )r   )rr   s    r/   <lambda>zend_graph.<locals>.<lambda>h  s    U1Q4[r.   T)r   reverseaz$Save profile bandwidth results to %sz====================
zTRITON KERNELS BANDWIDTH INFO (d   % 	 suffix
z

z4failed to write profile bandwidth result into %s: %s)r=   r  sumr  stackfilenameprintr   profile_bandwidth_outputsortedopenr\   r`   writer   	Exceptionwarning)overall_time
overall_gbcur_filesummary_stroutput_filesorted_callsfilemsnum_gbgb_per_srS   
percentager0  bw_info_stres                  r/   	end_graphrI  X  s   
?q ;?;;L999J}}q!**H
H:S
HZ$4F:|TWGW;XY\:]]a	c  
+	G11K o3HRVW	k3'4		@+N

34

<XJcJK9E5B+$&|OC$7#<A!>J#J<tK=AF";FHV#K JJ{T12 :F 

k]$/0 (' 
 ('  	KKF 	s7   4E*  BEE* E'#E* 'E* *	F3FFc                   .     e Zd Zdd fd
Z fdZ xZS )DebugAutotuner )regex_filterc                @    || _         t        |   |i | d | _        y r  )rM  rT   rU   cached)rq   rM  r   r   rs   s       r/   rU   zDebugAutotuner.__init__  s$    ($)&)r.   c          	      V   t        |       }t        |t               }t        j                  | j
                  |      sy t        |   |||d | j                  \  }| j                  v | j                  |g|d|i}t        | j                  j                  D cg c]  }|j                  d      r| c}      }	t        |d|	idz  }
|
|dz  z  }||
||f| _	        n| j                  \  }}
}}t        j!                  ||
||f       t#        t%        ||
|d| 	             y c c}w )
Nr   r   r   
in_out_ptrnum_in_out_argsg    eAr'  r.  r/  )r  r   r=   rematchrM  rT   r  ra   rO  r   rV   r   
startswithr   r  r>   r5  r   )rq   r   r   r   possible_namesrS   r   rC  arg_namenum_in_out_ptrsrD  rE  rs   s               r/   r  zDebugAutotuner.run  s0   $T*^56xx));7TV4nn;;H7t7$7B! %)GG$5$5$5**<8 $5O #DJ/JSPFc*Hvx=DK04-B+FHkBC%b&(T+CWX	
s   D&)r%   r&   r'   rU   r  r  r  s   @r/   rK  rK    s    +- 

 
r.   rK  rA   c           	         t        j                         }| D ]_  }|j                  t        |j                  j                                d|j                   d|j                   dj                                a |j                         S )z:
    Hash used to check for changes in configurations
     r1  )
hashlibsha256updater7  r   r   r   r   encode	hexdigest)rA   hasherr   s      r/   hash_configsra    sr     ^^Fcjj&&()*!CMM?!CNN;K2NUUW	
  r.   cache_filenameconfigs_hashc                   	 t         j                  j                  |       syt        |       5 }t	        j
                  |j                               	ddd       	j                  dd      |k7  ryt        j                  rK	j                  dd      r9	j                  d      }	j                  d      }t        	||      }d|_        |S |D cg c]l  }t        	fd	|j                  j                         D              r>|j                  	j!                  d      k(  r |j"                  	j!                  d      k(  r|n }}t%        |      d
k7  ry|d   S # 1 sw Y   xY wc c}w )z3
    Read a cached autotuning result from disk
    Nrc  r   Fr   r   r   r   Tc              3   L   K   | ]  \  }}|j                  |      k(    y wr  rj   )r!  r   valr  s      r/   r#  z)load_cached_autotuning.<locals>.<genexpr>  s&     N;MxsCskooc**;Ms   !$r   r   )re   rg   existsr8  jsonloadsreadpopr   r  r   r   allr   r   r   rj   r   r=   )
rb  rc  rA   fdr   r   r?   r   matching_configsr  s
            @r/   load_cached_autotuningrq    s3    77>>.)	n	jj+ 
~t,<''KOO<OQV,WOOK0	 __\2
{iJW*.' CN3::;K;K;MNNMM[__[99NNkool;;	 	   !A- 
	s   $EA1E"Er@   c                 p   	
 t              t              dk(  s|sJ i n|ht              dkD  st        j                  rJt        j
                  j                  |      d   dz   t              	t        	      }|r|gd		fd	ndj                  dd      

 fd}|S )
z
    A copy of triton.autotune that calls our subclass.  Our subclass
    has additional debugging, error handling, and on-disk caching.
    r   Nr   z.best_configc                 `   t        d      5 }|j                  t        j                  i | j                  | j
                  | j                  |d             d d d        t        j                  t        j                        r|rdnd}t        j                  d|       y y # 1 sw Y   KxY w)Nw)r   r   rc  r   coordesc	heuristiczSave %s tuning result to %s)r8  r9  rj  dumpsr   r   r   r\   r]   r^   r_   r`   )r   r   ro  type_strrb  rc  s       r/   rY   z(cached_autotune.<locals>.save_cache_hook  s    nc*bJJ!jj),*-..,81B
 + .)::		7>R / +*s   AB$$B-rZ   r-   c                 p   dd l }d |j                  | j                        j                  vrED ]@  }d|j                  v s|j                  d   dk(  sJ |j                  j                  d       B t        j                  r"t        | 	t        j                  	      S t        | 	      S )Nr   rw   r   )rW   rX   rM  rA   rY   rZ   r[   r@   )rW   rX   rA   rY   rZ   r[   r@   )r  	signaturerV   
parametersr   rm  r   profile_bandwidthrK  profile_bandwidth_regexrM   )
rV   r  tconfigrA   r[   rX   rZ   rY   r@   rW   s
      r/   	decoratorz"cached_autotune.<locals>.decorator  s    
 	,7,,RUU3>>>"w~~-">>(3q888NN&&x0 #
 ##!'+#;; /"3-%
 
  #'+/)!	
 		
r.   )F)
unique_configsr=   r   r  re   rg   splitextra  rq  rm  )r@   rA   rW   r[   r4  rX   r  r  rb  rc  rZ   rY   s   ```` `  @@@@r/   cached_autotuner    s     W%Gw<1(('/B]M W!1V5U5U))(3A6G#G,,^\7S"mG	S$ %))*=rB"
 "
H r.   c                     t               }g }| D ]4  }t        |      }||vs|j                  |       |j                  |       6 |S )zRemove duplicate configurations)r}   r   r   r>   )rA   seenpruned_configsr   r   s        r/   r  r  %  sJ    5DN',d?HHSM!!#&	 
 r.   xnumelynumelznumelc                6   t        |||fd      D ]  \  }}|	| | d   }|dk(  r*|dk(  s%J d|j                          d| d| d| d|  d	       t        j                  j                  |   }d
| d}||z  dk(  rlJ d| d| d| d| d| d| d|  d	        y )NXYZBLOCKr   z;TritonKernel.indexing assumes numel == 1 => BLOCK == 1 but znumel==z and zBLOCK=z (cfg=z).zconfig.triton.max_block["z"]r   zTritonKernel.indexing assumes zBLOCK divides z but =)r   lowerr   rI   	max_block)	r   r  r  r  numellabelblockr  max_block_strs	            r/   check_configr  2  s    VVV4e<u=ugUO$A:A: geWE%ugVTWSXXZ\: MM++E2	3E7"=5 A% 	
,UG>-E7&u]O1YKvcURTV	
% =r.   c                 <   t        t        |             } g d}t        |||      }t        |  |k  r|dz  }t        || d         }|rt        || d         }|rt        || d         }|t        | d   t        j
                  j                  d         k  rp||d   z  | d   k  st        |||      |k  rR|dz  }|t        | d   t        j
                  j                  d         k  r ||d   z  | d   k  rAt        |||      |k  rR|r|t        | d   t        j
                  j                  d         k  rr||d   z  | d   k  st        |||      |k  rT|dz  }|rM|t        | d   t        j
                  j                  d         k  r ||d   z  | d   k  rCt        |||      |k  rT|r|t        | d   t        j
                  j                  d         k  rr||d   z  | d   k  st        |||      |k  rT|dz  }|rM|t        | d   t        j
                  j                  d         k  r ||d   z  | d   k  rCt        |||      |k  rTt        t        t        t        |||      |z  d      d            }	t        |||      d	k\  rt        |	d
      n|	}	| d   }
|r| d   nd}|r| d   nd}t        t        |||      |t        z  |	z        }|t        j                  |t        |||      z        z  }d|i}|r||d<   |r||d<   t        ||
||       t        ||	|      S )a  
    Construct a pointwise triton config with some adjustment heuristics
    based on size_hints. Size_hints is a tuple of numels in each tile
    dimension and will be rounded up to the nearest power of 2.

    num_elements_per_warp is a suggestion for controlling how many warps
    the triton config should contain. e.g.: if x=16, y=8, z=4 then
    num_elements = 16*8*4 = 512. Then if we set num_elements_per_warp=128,
    we'll launch 512 (elem) / 128 (elem/warp) = 4 warps. Note that it's
    just a suggestion, and sometimes other adjustment heuristics will
    override the num_elements_per_warp.

    min_elem_per_thread controls the minimum number of elements
    processed by each thread. It's always enforced.
    )i  r  rv   r   r   r:   XYZ   r9   Nrw   r   r   r  re  )listreversedr   r   r   rI   r  r   r   _NUM_THREADS_PER_WARPmathceilr  r   )r@   xyzr   r<   min_elem_per_threadmaxGridSizetargetr   r  r  r  r6   r   s                  r/   r?   r?   D  sW   6 hz*+J,K Aq)FJ'&01 	Az!}A:a=!:a=! c*Q-!8!8!=>
>	KNZ]*.A!Q.JV.S	Q c*Q-!8!8!=>
>	KNZ]*.A!Q.JV.S 	
JqM6==#:#:3#?@@AA.2EaA2NQW2W 	
Q 	
JqM6==#:#:3#?@@AA.2EaA2NQW2W
 	
JqM6==#:#:3#?@@AA.2EaA2NQW2W 	
Q 	
JqM6==#:#:3#?@@AA.2EaA2NQW2W
  C#Aq!,0EEqI1MI &9Aq%AS%HIq!iI]FZ]TFZ]TF Aq!$33i?J : 3Aq! <<	==AQ-CHHVF6B#zBBr.   c                    t        ||      }t        |  |k  r|dz  }t        || d         }t        || d         }|| d   k  r,t        ||      |k  r|dz  }|| d   k  rt        ||      |k  r|| d   k  r,t        ||      |k  r|dz  }|| d   k  rt        ||      |k  r||d}|t        ||      dz  }t        t        t        |d      d            }t	        || d          t        |||      S )	z
    Construct a reduction triton config with some adjustment heuristics
    based on size_hints. Size_hints is a tuple of numels in each tile
    dimension and will be rounded up to the nearest power of 2.
    rv   r   r   r:   )rw   rx   r  )r  re  r   r   r   r   r  r   )r@   r  rr   r   r  r   s          r/   triton_config_reductionr    s$    !A&FJ'&01 	Az!}AAz!}A jm
 3Aq 9F B	Q jm
 3Aq 9F B
jm
 3Aq 9F B	Q jm
 3Aq 9F B !
$C'1-4	C	1$5q 9:IZ]+#zBBr.   c                    t        |||      }t        |  |k  r|dz  }t        || d         }t        || d         }t        || d         }|| d   k  r.t        |||      |k  r|dz  }|| d   k  rt        |||      |k  r|| d   k  r.t        |||      |k  r|dz  }|| d   k  rt        |||      |k  r|| d   k  r.t        |||      |k  r|dz  }|| d   k  rt        |||      |k  r|||d}t        t        t        t        |||      dz  d      d            }t	        || d   | d          t        |||      S )	z
    Construct a tile reduction triton config with some adjustment
    heuristics based on size_hints. Size_hints is a tuple of numels in
    each tile dimension and will be rounded up to the nearest power of 2.
    rv   r   r   r:   )rw   r   rx      )r  r  re  r  )r@   r  r  r  r   r  r   r   s           r/   triton_config_tiled_reductionr    s    !Aq)FJ'&01 	Az!}AAz!}AAz!}A jm
 3Aq! <v E	Q jm
 3Aq! <v E
jm
 3Aq! <v E	Q jm
 3Aq! <v E
jm
 3Aq! <v E	Q jm
 3Aq! <v E !q
1CC(;Aq!(D(KQ$OQR STIZ]:a=A#zBBr.   c                 6   |i n|}t        j                  t        j                  |       }t	        dt        |dz  d            }t        |j                  dt                     | |      }t        j                  t        |      }	t        |       dk(  rt               rGt        j                  s7t        j                  s't!        |  |	| |      g||t"        j$                  |      S t!        |  |	| |d       |	| |d	z  d
      g|||t"        j$                  |      S t        |       d	k(  rt               s|t&        j(                  k(  rHt        j                  s8t        j                  s(t!        |  |	| dd      g||t"        j$                  |      S t!        |  |	| dd       |	| d
d
       |	| dd       |	| dd       |	| |d       |	| d|      g||||t"        j$                        S t        |       dk(  rt               r)t!        |  |	| ddd      g||t"        j$                  |      S t!        |  |	| ddd       |	| d
dd       |	| dd
d       |	| ddd
       |	| |dd       |	| d|d       |	| dd|      g||||t"        j$                        S t+        d|        )z=
    Construct @triton.heuristics() based on size_hints.
    r  r  i   autotune_hints)r  r   rW   rX   r[   r4  )r<   r:   rz   r"      rW   rX   r4  r[   r;   rv   size_hints: )	functoolsreduceoperatormulr   r   rE   rj   r}   partialr?   r=   rK   r   max_autotunemax_autotune_pointwiser  r$   r(   r   SQUARENotImplementedError)
r@   rW   	tile_hintr4  r  rX   r  bshinted_configstriton_config_with_settingss
             r/   	pointwiser    s    (/B]MX\\:6E	S#eslD)	*B.*CE2JN #,"3"3+># :!')6#@#@",Z<='+,66!  #/"Bc 0"B!G2	 $ (+,66!   :!(*i8??.J6#@#@",ZR@A'+,66!  +JB?+JB?+JR@+JC@+JA>+J2>   $'(22
 	
  :!')",ZRDE'+,66!  +JBC+JAqA+J2qA+J1bA+JAqA+J2qA+J1bA	  	 $'(22
 	
" ZL9
::r.   c                 d   |i n|}|J | d   }t        |       dk(  rt        | dd|cxk  rdk  rn n|nd      }t        | dd      }t        | |dk  rdd|z  z  ndt        |d            }t        j                  st        j
                  rn|t        j                  k(  r t        | |g||t        j                  |      S |t        j                  k(  r t        | |g||t        j                  |      S |t        j                  k(  r t        | |g||t        j                  |      S t               r+t        | t        | d	d
      g||t        j                  |      S t        | |||t        | dd      t        | dd      t        | ddd      g|||t        j                        S t        d|        )zargs to @triton.heuristics()r:   r   r  i   rz   rv   r  r"   r  i   r9   )r   r  r  )r=   r  r   r   r  r  r   INNERr  r$   r)   OUTER
OUTER_TINYrK   r  )	r@   reduction_hintrW   r4  rX   rnumelcontiguous_configouter_configtiny_configs	            r/   	reductionr  M  s    (/B]M"""^F
:!3cV&:d&:F
 /z2qA-v}SF]+!SQUEV
 &"?"?}222""#'+,66!  }222"'+,66!  }777"'+,66!  ()"(R=>'+,66!  !'
B;'
As; (
BQG
 $'(22!
 	
$ ZL9
::r.   c                    | \  }}dD cg c]  }||z  dk  r||k  rt        | ||       }}|t        j                  k(  r|dk\  r|d d }nG|t        j                  k(  r|dd  }n.|t        j                  k(  rt        | |dk  rdd|z  z  nd|      g}|D ]  }	|	j
                  j                  d        t               r|d d }t        | ||||t        j                        S c c}w )	N)r   rv   r"   r  i   r  r   r  r:   rx   r  )r  r   r  r  r  r   rm  rK   r  r$   r*   )
r@   r  rW   r4  rX   r  r  r   rA   rr   s
             r/   persistent_reductionr    s     NFF &%FF?d"v'7 	 
FF;%   ,,,3"1+	=..	."#,	=33	3#6S=A/a

 	X  $%"1+#$99 1s   !C"c                 n    t        dt        j                  i | |      g||t        j                  |      S )z#
    Compile a triton template
    Nr   r   r  r  rI   r   r$   r+   )r   r   rW   r4  rX   s        r/   templater    s9     	rjI	FG#$-- r.   c                    t        j                  t        j                        j                  }|d   j
                  }|d   j
                  }t        |       dk(  rt        j                  i ||      g} nU| D cg c]J  }t        j                  |j                  di       |j                  d|      |j                  d|            L } }t        d| |t        j                  ||      S c c}w )z.
    Compile a user defined triton kernel
    r   r   r   r  r   N)rW   r[   r4  rX   )r  rz  rI   r   r{  defaultr=   rj   r  r$   r,   )rA   rW   r4  rX   defaultsdefault_num_stagesdefault_num_warpsrr   s           r/   user_autotuner    s       /::H!,/77 -55
7|qMM1=N
 
  MMh#55/AB%%->?
  	 
 $22# 
s   8AC(c                 n    t        dt        j                  i d|      g| |t        j                  |      S )z)
    Compile a triton foreach kernel
    Nr   r  r  r  )rW   r   r4  rX   s       r/   foreachr    s9     	ra9	=>#$-- r.   c                      t        |       dk(  r| d   ddcnRt        |       dk(  r| d   | d   dcn6t        |       dk(  r| d   | d   | d   cnt        dt        |              d fd}|S )	z'Helper function to compute triton gridsr   r   Nr:   r;   zinvalid size for numels c                 (    | y|| S t        | |      S )Nr   )r   )r  r  s     r/   get_grid_dimzgrid.<locals>.get_grid_dim  s!    ==Lue$$r.   c           	           | j                  dd             | j                  dd              | j                  dd             fS )Nrw   r   r   r   rg  )r   r  r  r  r  s    r/   grid_fnzgrid.<locals>.grid_fn  sL    (A!67(D!9:(D!9:
 	
r.   )r=   AssertionError)numelsr  r  r  r  r  s     @@@@r/   r   r     s    
6{a!'D$	V	!'F1It	V	!'F1Ivay7F}EFF%
 Nr.   r  )NNr   r  r   r&  )r   )NNr   N)FNNN)^r   r   r  r[  r  rj  r^   r  r  re   os.pathrS  rb   enumr   r   typingr   r   r   r   r	   r
   r   rG   torch.autograd.profilerautogradprofilerr  torch._dynamo.device_interfacer   torch._dynamo.utilsr   torch.utils._tritonr   r   rL  r   	codecacher   r   coordinate_descent_tunerr   irr   r   utilsr   r   r   r   r   r   r   	getLoggerr%   r\   rI   r   triton.runtime.autotunerr   triton.runtime.jitr    objectr!   r  r$   r1   r
  rE   rK   rM   r  r  __annotations__r  rI  rK  ra  ri   rq  r  r  r  r?   r  r  r  r  r  r  r  r  r   r-   r.   r/   <module>r     s@            	  	   B B B  3 3 C , >  6 3 '   g!72FFON<2O  D 4 $|$69$	&\$N0~ ~B"  c $N
% 
D	$v, 	'*59&\N Sc#S&\Sl
DL 
 !%T$ 
* 
\C \C~Cv C:CB l;b L;b (VDr.   