
    Phz                        U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlmZ d dlmZmZmZ ddlmZmZ dd	lm Z m!Z! dd
l"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;  ejx                  e=      Z> e?       Z@eeAef   eBd<   dZCdZD G d d      ZE eE       ZF G d d      ZG G d de,      ZH ej                  d      d        ZJ G d de)      ZK G d d      ZL G d d e'      ZM G d! d"e'      ZN G d# d$eO      ZP G d% d&e$      ZQdaRd' ZSd( ZTdd)lmUZU y)*    N)StringIO)AnyCallableDictListOptionalTypeUnion)patch)rand_strided)countersidentitypreserve_rng_state   )configir)
TensorMetaTritonBenchmarkRequest)	code_hashPersistentCachePyCodeCache)ChoiceCallerIndentedBufferKernelTemplate)texprTritonKernelTritonPrinterTritonScheduling)	config_ofsignature_to_meta)CUDACompileError)do_benchPlaceholder	sympy_dotsympy_productunique)VVERIFYTFc                       e Zd Zy)KernelNamespaceN)__name__
__module____qualname__     kC:\Users\daisl\Desktop\realtime-object-detection\venv\Lib\site-packages\torch/_inductor/select_algorithm.pyr*   r*   &   s    r/   r*   c                   (     e Zd ZdZ fdZd Z xZS )PartialRenderz
    Some parts of a template need to be generated at the end, but
    inserted into the template at the start.  This allows doing a bunch
    of replacements after the initial render.
    c                 >    t         |           || _        || _        y N)super__init__codereplacement_hooks)selfr7   r8   	__class__s      r0   r6   zPartialRender.__init__5   s    	!2r/   c                     | j                   }|J d       d | _         | j                  j                         D ]  \  }}|j                  | |             } |S )Nzcan only be called once)r7   r8   itemsreplace)r9   r7   keyfns       r0   finalizezPartialRender.finalize:   sX    yy:!::	--335GC<<RT*D 6r/   )r+   r,   r-   __doc__r6   r@   __classcell__r:   s   @r0   r2   r2   .   s    3
r/   r2   c                        e Zd Zdddef fd	Zd Zd Zd Zdede	fd	Z
d
 Zd Zd Zd Zd Zdddddej"                  f fdZ fdZddedeej,                     fdZ xZS )TritonTemplateKernelTr   c                n   t         |   t        |j                               t	        j
                  d      |       || _        || _        i | _        || _	        || _
        d | _        |
| _        || _        || _        || _        || _        |	| _        || _        || _        || _        t+               | _        y )Nr   )index_dtype)r5   r6   r%   get_sizesympyIntegerinput_nodesoutput_nodenamed_input_nodesdefineskernel_nametemplate_maskuse_jit
num_stages	num_warpsgrid_fnmeta
call_sizesprefix_argssuffix_argsepilogue_fndictrender_hooks)r9   rO   rK   rL   rN   rR   rS   rT   rU   rV   rQ   rW   rX   rY   rG   r:   s                  r0   r6   zTritonTemplateKernel.__init__D   s    $ 	+..01MM!# 	 	

 '&!#&!$"	$&&& Fr/   c                      y)NFr.   r9   s    r0   need_numel_argsz$TritonTemplateKernel.need_numel_argsm   s    r/   c                     | j                   ry| j                  j                         \  }}}t        || j                        t
        j                  j                  j                  j                  t
        j                  j                  j                  j                  i d}t        |      g|d<   dt        t        j                        i}t        j                   d| j"                   d| j$                   d|d	|d
	      S )Nz@triton.jit)
size_dtype)	signaturedevicedevice_type	constantsconfigsrO   z3
            @template(
                num_stages=z,
                num_warps=z,
                triton_meta=z ,
                inductor_meta=z4,
            )
            @triton.jit
            )rQ   argspython_argdefsr    rG   r'   graph	schedulercurrent_deviceindextyper   strr#   DESCRIPTIVE_NAMEtextwrapdedentrR   rS   )r9   argdefs_ra   triton_metainductor_metas         r0   jit_linezTritonTemplateKernel.jit_linep   s    <<  $		 8 8 :I*9AQAQRgg''66<<77,,;;@@	
 #,I"6!7I&K,H,H(IJ OO, ->>* +(O ,,/ 0

 
	
r/   c                     t        d |D              sJ t        d       j                   j                  t	         j                         j
                  z
   }t	        |      t	        |      k(  s:J t	        |      t	        |       j                  t	         j                        f        j                  d j                   D ]+  } j                  j                  |j                                - t        ||      D ]@  \  }}d| }| j                  |<   | j                  j                  |j                         <   B |D ]  } j                  |   } j                  j                  |j                            }|j                         j                  dk(  rj                  | d|        mt         j!                  |j                         j                              }j                  | d| d|          j                  t	         j                         j
                  z
  d D ]+  } j                  j                  |j                                -  fd	}d
 j"                  vsJ | j"                  d
<   y
)zb
        Hook called from template code to generate function def and
        needed args.
        c              3   <   K   | ]  }t        |t                y wr4   )
isinstancerm   .0xs     r0   	<genexpr>z2TritonTemplateKernel.def_kernel.<locals>.<genexpr>   s     8x!:a%x   r   )initial_indentNarg_r   z =  + c                      j                   j                         ^} }dj                  ddddddj                         dj                   d	d
j                  |        dj
                  j                         g
      S )N
zimport triton.language as tlzimport tritonz6from torch._inductor.triton_heuristics import templatez5from torch._inductor.utils import instance_descriptorz*from torch._inductor import triton_helpers zdef (, z):)rf   rg   joinru   rO   rN   getvalue)arg_defsrr   renamesr9   s     r0   hookz-TritonTemplateKernel.def_kernel.<locals>.hook   s    99335LHq992#LK@MMO4++,Adii.A-B"ELL$$& r/   z<DEF_KERNEL>)allr   rK   rW   lenrX   rf   inputget_nameziprM   input_buffers
get_layoutoffset	writeliner   rename_indexingr[   )	r9   argnames
named_args
input_nodenamearg_namer   r   r   s	   `       @r0   
def_kernelzTritonTemplateKernel.def_kernel   sJ   
 8x8888 2%%s4#3#34t7G7GG

 8}J/ 	
M
O  !	2
 	
/ **+=T-=-=>JIIOOJ//12 ? !$Hj 9D*dV}H+5D""4(=EDII##J$7$7$9: !: D//5Jyy..z/B/B/DEH$$&--2!!TF#hZ"89t33J4I4I4K4R4RST!!TF#hZs6("CD  **3t/?/?+@4CSCS+S+UVJIIOOJ//12 W	$ T%6%6666,0.)r/   r   rk   c                     t        |t              sJ || j                  j                         |   }n2t        |t              sJ | j
                  |   j                         |   }t        | j                  |            S )z
        Hook called from template code to get the size of an arg.
        Will add needed args to pass it in if it is dynamic.
        )rx   intrL   rH   rm   rM   r   r   r9   r   rk   vals       r0   sizezTritonTemplateKernel.size   sv    
 %%%%<""++-e4CdC(((((.779%@CT))#.//r/   c                     t        |t              sJ || j                  j                         |   }n2t        |t              sJ | j
                  |   j                         |   }t        | j                  |            S )z
        Hook called from template code to get the stride of an arg.
        Will add needed args to pass it in if it is dynamic.
        )rx   r   rL   
get_striderm   rM   r   r   r   s       r0   stridezTritonTemplateKernel.stride   sv    
 %%%%<""--/6CdC(((((.99;EBCT))#.//r/   c                     t        |t        t        f      sJ t        |t              sJ t        |t              sJ  j                  J t        t        t        j                  |            }|D cg c]  }t        j                  |       }} j                  j                         D cg c]+  }t        j                  j                  j                  |      - }}t!        |      t!        |      k(  sJ t#        | j$                  d   j'                  |            D ]  \  }}	|	j)                  |        t+        t,        j.                  j1                  |      |      }
 j3                  |
      }
 j4                  j7                  dt9        |
      z           j$                  d   j;                  t        j<                  d      t?        |            j)                  d       | _        | _           j                  jC                         jE                         |      } j3                  |      }||
k(  rt        j                  d      }|g}tG        jH                   jJ                  d jL                    jJ                  t!         jJ                         jN                  z
  d       D ]7  }|jQ                          |jS                   |jU                         |             9 t        jV                  jY                   j                  j[                         |  j\                  |         j_                           fd}d j`                  vsJ | j`                  d<   yc c}w c c}w )z
        Hook called from template code to store the final output
        (if the buffer hasn't been optimized away), then append any
        epilogue fusions.
        Nr   z	xindex = r   xindexc                       j                          t        j                   j                  j	                         d      j                         S )N    )codegen_bodyro   indentbodyr   stripr]   s   r0   r   z/TritonTemplateKernel.store_output.<locals>.hook  s6    ??499#5#5#7@FFHHr/   z<STORE_OUTPUT>)1rx   listtuplerm   rP   mapr   parenrI   SymbolrL   rH   r'   rh   sizevarssimplifyr   r   range_treesconstruct_entriesset_namer$   r   FlexibleLayoutcontiguous_stridesr   r   r   r   lookuprJ   r%   template_indicesr   make_indexer	itertoolschainrK   rW   rX   freeze_layoutappendmake_loaderopsstorer   rY   r   r[   )r9   indicesr   maskr{   index_symbolsslengthsr   range_tree_entrycontiguous_indexoutput_indexepilogue_argsr   r   s   `              r0   store_outputz!TritonTemplateKernel.store_output   s    'D%=111#s###$$$$!!)))s=..8929:'Qa':9=9I9I9R9R9TU9TA177##,,Q/9TU7|s7|+++ '*T%%a(::7C'
"D" %%d+'
 %009=
  //0@A		K%0@*AAB""5==#3]75KLUU	
 " 'Ct''224AACMR++L9++ <<1L#///t//0S!1!12T5E5EEGH
J $$&  !9!7!7!9-!HI
 	
%%'Dm,	

 		I
  t'8'8888.2*+] ;Us   3M-0M"c                 n    t         |j                  di | j                         || j                        S )Nr.   )r2   rendertemplate_envr[   )r9   templatekwargss      r0   r   zTritonTemplateKernel.render  s7    HOO<d//1<V<
 	
r/   c                     t        |t        t        f      sJ t        |t              sJ t        |t              sJ  j                  |   j                         }t        t        t        j                  |            }t        |      t        |      k(  sJ dj                   fdt        ||      D              }d| d| d| dS )zw
        Optional helper called from template code to generate the code
        needed to load from an tensor.
        r   c              3   b   K   | ]&  \  }}t        j                  |             d |  ( yw)z * N)r   r   )rz   r   ir9   s      r0   r|   z1TritonTemplateKernel.make_load.<locals>.<genexpr>0  s6      
BV$!QuT))!,-.c!5BVs   ,/ztl.load(z + (z), ))rx   r   r   rm   rM   r   r   r   r   r   r   r   )r9   r   r   r   r   rk   s   `     r0   	make_loadzTritonTemplateKernel.make_load%  s    
 'D%=111$$$$$$$$''-88:s=..897|s6{***

 
BEfgBV
 
 $tE7#dV155r/   c                     | j                   | j                  | j                  | j                  | j                  fD ci c]  }|j
                  | c}S c c}w )zA
        Generate the namespace visible in the template.
        )r   r   r   r   r   r+   r9   r?   s     r0   r   z!TritonTemplateKernel.template_env5  s\     		!!	
 KKO	
 		
 	
s   ANF)
copy_shapedense_indexingoverride_maskc                b    t         |   |d| j                  | j                        ^}}|g|S )zh
        Override the default indexing to use our custom mask and force
        dense indexing.
        F)r   r   r   )r5   indexingrP   )r9   rk   r   r   r   resultr   r:   s          r0   r   zTritonTemplateKernel.indexingD  sC     ( )),,	 ) 
 r/   c                     t         |   |       | j                  j                          | j                  j                          y r4   )r5   initialize_range_treer   clearindexing_code)r9   	pid_cacher:   s     r0   r   z*TritonTemplateKernel.initialize_range_treeX  s0    %i0		  "r/   nodec           
         t         j                  j                  }| j                  j	                         \  }}}|D cg c]  }t        |       }}t        t        |            D ]^  }t         j                  j                  ||         r||   dz   ||<   t        ||   t        j                        sNt        ||         ||<   ` t         j                  j                  r| j                  D cg c]+  }t         j                  j                  j!                  |      - c}| j"                  gz   }	 | j$                  |	 }
|j'                  ||t         j                  j(                  j*                  j,                  |
       y |j/                  t         j                  j(                  j*                  j,                        }|j1                  d| j$                  j2                          |j5                  | j"                        }| j                  D cg c]4  }t        t         j                  j                  j!                  |            6 c}|gz   }| j$                  j2                   d| j$                  j6                   ddj9                  |       d}|j;                  | ddj9                  |       d	| d
| d       y c c}w c c}w c c}w )Nz.item())device_indexgridzimport .r   r   r   z.run(z, grid=z	, stream=)r'   rh   wrapper_coderf   rg   rm   ranger   is_unspec_argrx   rI   r   r   cpp_wrapperrV   r   r   rU   rT   generate_kernel_callri   rj   rk   write_get_raw_streamadd_import_oncer,   add_meta_oncer+   r   r   )r9   r   r   wrapperrr   	call_argsar   r   	grid_argsr   stream_namerU   	grid_calls                 r0   call_kernelz TritonTemplateKernel.call_kernel^  s`   ''&&))2249a%./YSVY	/s9~&Aww$$Yq\2(|i7	!)A,5$Yq\2	!	 ' 77 @DO!))2215O		S I  4<<+D((WW..==CC	 )  "66!!0066K ##gdll.E.E-F$GH((3D >B__=Lagg&&//23_I  <<2231T\\5J5J4K1TYYW`MaLbbcdI&dii	2379+Y{m[\]K 0 P&s   J=.0K9Kr4   )r+   r,   r-   r   r6   r^   ru   r   rm   r   r   r   r   r   r   r   rI   Exprr   r   r   r   IRNoder   rB   rC   s   @r0   rE   rE   C   s     '#R
4>@0 0S 009 v
6 
& zz(#* *8BII+> *r/   rE   c                  b    	 dd l } | j                  | j                        S # t        $ r Y y w xY w)Nr   )	undefined)jinja2EnvironmentStrictUndefinedImportError)r   s    r0   _jinja2_envr     s?    !!,, " 
 	
  s   " 	..c                        e Zd ZU  ej                         Z e       Zee	d f   e
d<   dde	dede	f fdZddefdZ xZS )	TritonTemplateall_templatesr   r   sourcec                     t         |   |       || _        | j                  |      | _        || j
                  vsJ d       | | j
                  |<   || _        y )Nzduplicate template name)r5   r6   r   _template_from_stringr   r  debug)r9   r   r   r  r  r:   s        r0   r6   zTritonTemplate.__init__  sZ    	226:4---H/HH-#'4 
r/   r   c                 H     j                   sJ d       t               }	j                         D ]  \  }
}|	j                  d|
 d| d        |	j	                         }	t        j                  d|      }d j                   }t        |j                        }t        j                  ||f      }t        j                  ||      st        d      t        ||	|| j                   |j                  |||d	      t#        j$                  t&        j(                  d
 j+                  |            5  t-        d||dd5 }	 |j/                   j                         j1                         } j4                  rt7        d|       dj9                  g t;        j=                               D cg c]  }| dt?        |           c}d| d|       dz   }tA        jB                  ||      }|jD                  jG                         \  }}}d d d        d d d        tI        tK        d |D                    }|jM                  |jO                         g       tI              d tQ        |       |k(  s	J ||f       t&        j(                  jR                  jU                  tW        tX        jZ                  |tQ        |      d        t\        j^                        }d j                   dta         jb                         } fd}jd                  J   j                   g t&        j(                  jR                  jU                  |j                  t\        j^                         }tg        |jd                  |jh                  |||||tk        jl                  |      tk        jl                  |      	      }to        ||||jq                  d      js                  dd      |      S # t2        $ r Y d d d        d d d        y w xY wc c}w # 1 sw Y   xY w# 1 sw Y   xY w)Nzrequires jinja2r   z : tl.constexpr = r   buf_outtriton_z;64-bit indexing is not yet implemented for triton templatesztl.int32)rK   rN   rR   rS   rT   rU   rV   rW   rX   rY   rG   	get_dtypeTrO   rL   rQ   zGenerated Code:
-=znum_stages=z
num_warps=c              3   <   K   | ]  }|j                           y wr4   )r   ry   s     r0   r|   z*TritonTemplate.generate.<locals>.<genexpr>  s     #F+QAJJL+r}   fallbackrr   c                     t        dt        t        j                        | dd}t	        j
                  |j                  j                        }||fS )NFr
  r.   )rE   rm   r#   KERNEL_NAME	functoolspartialr   r   )out_nodekernelr   kernel_optionsr   r9   s      r0   make_kernel_renderz3TritonTemplate.generate.<locals>.make_kernel_render  s_    )  7 78$ !	F &&F
 6>!r/   )	module_pathmodule_cache_keyrO   r   
extra_argsrR   rS   input_tensor_metaoutput_tensor_metar   r.   ):r   r   r<   writer   r   Bufferr   r%   r   r   r   r   can_use_32bit_indexingNotImplementedErrorrZ   r   r   objectr'   rh   _fake_get_dtyperE   r   r@   ZeroDivisionErrorr  printr   sortedkeysreprr   loadrf   rg   r   r&   extendr   r   r   
size_hintsr   rI   expandr   unbacked_symint_fallbacknextindex_counter__file__r   r>   r   from_irnodesTritonTemplateCallerr   r=   )r9   rK   layoutrR   rS   rW   rX   rY   r   rN   r   r   fake_outrO   numelbuffersr  r7   kwargextramodrr   r   expected_argsr  kernel_hash_namer  r   bmreqr  s   `       `                    @r0   generatezTritonTemplate.generate  s    }}///}*ID#MMD&8R@A (""$99Y/		{+fkk*//+{;66ugF%M  #!II{{###"
 \\GG[$"6"6x"@
 
# 
 	

 }}T]]F;DDF zz)40 *0)>)>  %gQtF5M':&;<)>
 &j\2 %YK0	 
  ""4/C$kk88:OAy!7

> V#F+#FFGh//123I3]!34E 	
H
 	
E WW%%00iM(:(<=>44 1 


 %TYYKqd6H6H1I0JK	" ||'''tyy 
WW((88 ) 

 
 ' WW#!!(55kB)66v>

 $KK$$S$/
 	
K % 

 
!
 

 
s[   1PP
*O&-?P
,PA	P
P&	P/P
0PPP

P	PP!)F)r+   r,   r-   r   countr.  rZ   r  r   rm   __annotations__r   r6   r   r<  rB   rC   s   @r0   r   r     sS    #IOO%M15M4--.7S  S  ~
r/   r   c                   n     e Zd Z	 d	ddd fdZd Zd Z ej                  d      d        Zd
dZ	 xZ
S )ExternKernelChoiceNT)r   has_out_variantc                    t         |           |xs |j                  }t        |      sJ t	        t
        |      rJ d       || _        || _        || _        t        t
        ||       y )Nzduplicate extern kernel)
r5   r6   r+   callablehasattrextern_kernelsr   
cpp_kernelrA  setattr)r9   r  rF  r   rA  r:   s        r0   r6   zExternKernelChoice.__init__%  sd     	&v>40K2KK0	$.f-r/   c                 6    t        t        | j                        S r4   )getattrrE  r   r]   s    r0   to_callablezExternKernelChoice.to_callable6  s    ~tyy11r/   c                      d| j                    S )Nzextern_kernels.r   r]   s    r0   	call_namezExternKernelChoice.call_name9  s     ,,r/   c                    | j                         }| j                  t        |dd      t        |dd      g}	 |j                  t	        j
                  |             t        dj                  |            S # t        $ r Y %w xY w)Nr+   r   r,   r  )	rJ  r   rI  r   inspect	getsource	Exceptionr   r   )r9   r?   partss      r0   hash_keyzExternKernelChoice.hash_key<  s{    IIB
B'Bb)

	LL**2./ %))  		s   $A5 5	B Bc                 D    || _         t        | |||| j                        S )NrA  )ordered_kwargs_for_cpp_kernelExternKernelCallerrA  )r9   rK   r2  rV  r   s        r0   bindzExternKernelChoice.bindJ  s(    -J*!+vvt?S?S
 	
r/   r4   )r.   )r+   r,   r-   r6   rJ  rM  r  	lru_cacherS  rX  rB   rC   s   @r0   r@  r@  $  sJ     .
 ."2- Y* *
r/   r@  c                   <     e Zd Z fdZd Zd Zd Zd Zd Z xZ	S )r1  c                 R    t         |   |||       || _        || _        || _        y r4   )r5   r6   r  debug_extrar;  )r9   r   rK   r2  r  r\  r;  r:   s          r0   r6   zTritonTemplateCaller.__init__R  s-     	{F3"4&
r/   c                V    | j                   J  | j                   j                  |d|iS )Noutput_tensor)r;  	benchmark)r9   outrf   s      r0   r_  zTritonTemplateCaller.benchmarkZ  s.    zz%%%#tzz##T===r/   c                 P    d| j                   j                   d| j                   dS )NzTritonTemplateCaller(r   r   )r;  r  r\  r]   s    r0   __str__zTritonTemplateCaller.__str__^  s*    &tzz'='=&>bAQAQ@RRSTTr/   c                      d| j                    S )Nztemplate_kernels.rL  r]   s    r0   rM  zTritonTemplateCaller.call_namea  s    "499+..r/   c                     dj                  | j                  j                  dd      d   | j                  j                  g      S )Nr  rr   r   r   )r   r   rsplitr;  r  r]   s    r0   rS  zTritonTemplateCaller.hash_keyd  s>    xx		  a(+

++
 	
r/   c                     t         j                  j                  t        j                  | j                  | j
                  | j                              S )N)r2  inputsr  )r   	TensorBoxcreateTritonTemplateBufferr2  rK   r  r]   s    r0   rL   z TritonTemplateCaller.output_nodel  s@    ||""##{{''#'#:#:
 	
r/   )
r+   r,   r-   r6   r_  rb  rM  rS  rL   rB   rC   s   @r0   r1  r1  Q  s"    >U/

r/   r1  c                   P     e Zd Z	 d
dddef fdZd Z fdZd Zd Zd	 Z	 xZ
S )rW  TrU  choicec                n    t         |   |j                  ||       || _        |xs i | _        || _        y r4   )r5   r6   r   rl  r   rA  )r9   rl  rK   r2  r   rA  r:   s         r0   r6   zExternKernelCaller.__init__w  s4     	k6:l.r/   c                 >    d| j                   j                          dS )NzExternKernelCaller(r   )rl  rM  r]   s    r0   rb  zExternKernelCaller.__str__  s    $T[[%:%:%<$=Q??r/   c                n   | j                   rt        |   d|iS | j                           }t        j
                  j                  j                  j                  |t        |j                               t        |j                                      |j                  |       t        fd      S )Nr`  c                         S r4   r.   )algorf   s   r0   <lambda>z.ExternKernelCaller.benchmark.<locals>.<lambda>  s	    D$Kr/   )rA  r5   r_  rJ  torch_C_dynamoguardsassert_size_strider   r   r   copy_r"   )r9   r`  rf   out_newrq  r:   s     ` @r0   r_  zExternKernelCaller.benchmark  s    7$d444##%DDkGHH##66sxxz*E#**,,? IIg/00r/   c                     | j                   j                         }| j                  r t        j                  |fi | j                  S |S r4   )rl  rJ  r   r  r  r   s     r0   rJ  zExternKernelCaller.to_callable  s:    [[$$&;;$$R74;;77Ir/   c                 $   dj                  | j                  j                  gt        | j                  j                               D cg c]  }| dt        | j                  |          ! c}| j                  j                               S c c}w )Nr  r  )r   rl  r   r%  r   r&  r'  rS  )r9   r6  s     r0   rS  zExternKernelCaller.hash_key  s    xx   "((8(8(:!;!; gQtDKK$6789!; $$&	
 		
s   $B
c           
      n   | j                   rt        j                  }nt        j                  }t        j                  j                   || j                  | j                  | j                  j                         | j                  j                  | j                  j                  | j                              S )N)r2  rg  r  rF  rV  r   )rA  r   ExternKernelOutExternKernelAllocrh  ri  r2  rK   rl  rM  rF  rV  r   )r9   clss     r0   rL   zExternKernelCaller.output_node  s    $$C&&C||""{{''{{,,.;;11.2kk.W.W{{	
 		
r/   r4   )r+   r,   r-   r@  r6   rb  r_  rJ  rS  rL   rB   rC   s   @r0   rW  rW  v  s9     / /"/@
1


r/   rW  c                   $     e Zd Zdef fdZ xZS )ErrorFromChoicerl  c                 H    |d| d| z  }t         |   |       || _        y )Nz
From choice r   )r5   r6   rl  )r9   msgrl  
inputs_strr:   s       r0   r6   zErrorFromChoice.__init__  s.    xr*66r/   )r+   r,   r-   r   r6   rB   rC   s   @r0   r  r    s    L  r/   r  c            
           e Zd Z	 d	dee   deeeee	j                  gej                  f   f      fdZe	 d	d       Zed        Zed        Zed        Zy)
AlgorithmSelectorCacheNchoicesinput_gen_fnsc                 x    ddl m} D cg c]  }||	 c}t              dk(  rt        d      t        j                  dt        t                           t              dk(  r"t        d   |      sd   j                         S t        j                  d        fd       fd}t        j                  rddlm}	 |	j                          t!        j                          }
 j#                  |t%        D cg c]  } j'                  |       c}      |      }t!        j                          |
z
  }|i k(  sd   |vrd   j                         S j)                         j*                  rt,        d	   d
xx   dz  cc<   j)                         j*                  s%t        j/                         t0        j2                  k(  r j5                  |||       t7        j8                  ||j:                        j                         }t        j=                  dt        |             |S c c}w c c}w )Nr   )CUDATemplateCallerr   zNo choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. z%Max autotune selects from %s choices.c                  ,    j                         S r4   )make_benchmark_fn)r  r  rK   r2  r9   s   r0   r  z:AlgorithmSelectorCache.__call__.<locals>.make_benchmark_fn  s    ))';VVr/   c                              |       S r4   r.   )r  r  s    r0   autotunez1AlgorithmSelectorCache.__call__.<locals>.autotune  s    &$&w//r/   )tuning_poolinductorselect_algorithm_autotuner>   zselected choice: %s)codegen.cuda.cuda_kernelr  r   RuntimeErrorloginform   rx   rL   r  rY  r   autotune_in_subprocautotune_processr  
initializetimer   r'  key_of
cache_infocurrsizer   getEffectiveLevelloggingDEBUGlog_resultsbuiltinsmin__getitem__r  )r9   r   r  rK   r2  r  r  rl  r  r  autotune_start_tsr{   timingsautotune_elapseselected_choicer  s   ` ````         @r0   __call__zAlgorithmSelectorCache.__call__  s    	A )0Ff63E6Fw<1_  	8#c'l:KLw<1gaj*<=qz--//			T	"	W 
#	W	0 %%5 ""$ IIK+++6+Q$++a.+67	
 ))+(99b=GAJg51:))++'')22Z !<=B=((*33$$&'--7T;I",,wG4G4GHTTV		'_)=>] G< 7s   H2H2H7c                    |i }t        |      D ci c]6  \  }}|j                          |j                  || j                        |      8 }}}t	        |j                               |D cg c]  }t        j                  ||j                            t        j                  j                  j                  |j                         t        j                        t        j                  j                  j                  |j                         t        j                        t        j                  j                  j!                  |j#                         j$                  t        j                               c}| j                  |      t        j                  j'                         j)                         t        j                  j                  j!                  |j$                              t*        r% |d   j,                  di j/                         t0        rt3        t5        |       d       fdfdfdfd}	t        j6                  r|	}
|
S }
|
S c c}}w c c}w )	Nr  r   r`  z tuning requests:c                      d } dg}D ]  }|j                  d | |       d        |dd |        dgz  }dj                  |      S )	Nc           	          dt        | j                               dt        | j                               d| j                  d| j                  j
                  d	S )Nztorch.empty_strided(r   z, dtype=z	, device=r   )r   r   r   dtyperb   rl   )r{   s    r0   tensor_reprzPAlgorithmSelectorCache.make_benchmark_fn.<locals>.debug_str.<locals>.tensor_repr1  sQ    *5?*=Rahhj@Q?T UWWKy0ADr/   z
inputs = [r   ,]zout = r   r   )r   r   )r  linesr{   example_inputsr`  s      r0   	debug_strz;AlgorithmSelectorCache.make_benchmark_fn.<locals>.debug_str0  sg     E $tKN#3156 $cVK$4#56;;E99U##r/   c                 $   j                          t        | t              r | j                  di}n | j                  di}t        r%t        j                  j                  fi t         t
        j                  j                          |S )Nr`  )
zero_rx   rW  r_  r(   rs  testingassert_closecudasynchronize)rl  r   r  example_inputs_externexpectedr`  
out_externs     r0   #benchmark_choice_in_current_processzUAlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_choice_in_current_process?  sy    IIK&"45)))+@QjQ *))>CsC**:xJ6JJJ""$Mr/   c                    i }| D ]  }	  |      }|||<    |S # t         $ r4}t        j                  dt        |             t	        d      }Y d }~Ad }~wt
        $ rZ}t        |      }d|v r&|dz  }t        j                  |       t	        d      }nd|v r|dz  }t        ||              Y d }~d }~wt        $ r}t        d| d|       d }~ww xY w)	Nz1CUDA compilation error: 
%s. 
Ignore this choice.infzinvalid argumentz>

This may mean this GPU is too small for max_autotune mode.

zillegal memory accessz*

Either error in template or triton bug.
zIncorrect result from choice z

)r!   r  warningrm   floatr  r  AssertionError)r  r  rl  timinger  r  r  s         r0   benchmark_in_current_processzNAlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_in_current_processL  s    G!@HF* #)/ "2 N- ( *KKMsSTv #5\F# 	Ha&C)S0ccC(!&u2c9#RRC-c69;GG	 
 & (7xtA3G s-   	C*ACAB44C CCc                     ddl m} | D cg c]  }t        |t              s| }}| D cg c]  }t        |t              r| }} |      }|j	                  |j                  |             |S c c}w c c}w )Nr   )r  )r   r  rx   rW  updatebenchmark_in_sub_process)r  r  cexterntritonr  r  s         r0   r  zJAlgorithmSelectorCache.make_benchmark_fn.<locals>.benchmark_in_sub_processi  st    * ")NAJq:L,MaFN!(RA
1>P0QaFR26:GNN+DDVLMN ORs   A2A2A7A7)	enumerater   getbenchmark_example_valuer   valuesrs  
as_stridedr'   rh   r   r*  rH   r   r,  r   	size_hintr   r   r   r   r(   r_  cloner  r$  r   r  )r  r  rK   r2  r  r   r{   unique_example_inputsr   r  r_  r  r  r  r  r  r  r`  r  s              @@@@@@@@r0   r  z(AlgorithmSelectorCache.make_benchmark_fn  s>     M
 "+.!
.1 JJLK-++As/J/JKANN. 	 !
 3::<=" *!!
  *
 %j&9&9&;<  ++'')#<< ,    ++))+#<< ,    **))+22#<< +  *!!
& ))&1%%SZZ\177+;+;+E+Efmm+T

  GAJ  "7HZH!'')HS\N"345	$	 		:
	 )) % 	  . 	 ]!

!
s   ;I46DI:c                    t         j                  st         j                  rt        sy dj	                  |D cg c]f  }dj	                  t        t        t        j                  j                  j                  |j                         t         j                                    h c}      }t        j                         t        j                   k(  rd nd}t#        ||j$                        d | }|d   }||   }t&        j(                  j+                  d|  d| d	       |D ]s  }	||	   }
|
r9t&        j(                  j+                  d
|	j,                   d|
dd||
z  dd       Ct&        j(                  j+                  d
|	j,                   d|
dd       u t         j.                  rdnd}t&        j(                  j+                  | d|dd       y c c}w )Nr   r{   r  
   r  r   z	AUTOTUNE r   z)
z   z.4fz ms z.1%r   z ms <DIVIDED BY ZERO ERROR>

SubProcessSingleProcessz AUTOTUNE takes z	 seconds
)r   max_autotunemax_autotune_gemmPRINT_AUTOTUNEr   r   rm   r'   rh   r   r*  rH   r,  r  r  r  r  r%  r  sysstderrr  r   r  )r   rK   r  elapsensizestop_kbest	best_timerl  r   autotune_type_strs               r0   r  z"AlgorithmSelectorCache.log_results}  s   ##v'?'?		 %
 %A ((33JJL63R3R 4  %

 ))+w}}<D"wG$7$78!<QxDM	

9TF!E7#67FV_F

  Qvcl$y7G6LBO 

  Qvcl2OP  #66LO 	 	

-..>vcl*UV?
s   A+Gc                    t        | t        j                        rt        j                  d|       } t        | t        j                        r| j                         } t               5  t        t        j                  j                  j                  | j                         t        j                        t        j                  j                  j                  | j                         t        j                        | j!                         | j#                         | j$                  j&                        cddd       S # 1 sw Y   yxY w)zh
        Convert an ir.Buffer into a concrete torch.Tensor we can use for
        benchmarking.
        faker  )rb   r  
extra_sizeN)rx   r   Layoutr  BaseViewunwrap_viewr   r   r'   rh   r   r*  rH   r   r,  r   
get_devicer	  r2  r   )r   s    r0   r  z.AlgorithmSelectorCache.benchmark_example_value  s     dBII&99VT*DdBKK(##%D  !  ++MMO#<< ,    ++OO%#<< ,  (nn&;;-- "!!s   %C
D99Ec                    t         j                  j                  }| j                         j                  t        | j                               g|j                  | j                         t        j                        |j                  | j                         t        j                        |j                  | j                         j                  t        j                        S )zt
        Extract the pieces of an ir.Buffer that we should invalidate cached
        autotuning results on.
        r  )r'   rh   r   r  rl   rm   r	  r*  rH   r   r,  r   r  r   r   )r   r   s     r0   r  zAlgorithmSelectorCache.key_of  s     77##OO"" !
   88 ! 
   !88 ! 
 !((88  
 	
r/   r4   )r+   r,   r-   r   r   r   r   r   r   r   r  rs  Tensorr  classmethodr  staticmethodr  r  r  r.   r/   r0   r  r    s     SW> l#>  S(BII;3L*M%M NO>@  y yv #W #WJ  6 
 
r/   r  c                  8    t         
t               a t        | i |S r4   )_ALGORITHM_SELECTOR_CACHEr  )rf   r   s     r0   autotune_select_algorithmr    s!     ($:$<!$d5f55r/   c                      t        |       dk(  r?t        j                  j                  t        j                  j	                  | d               S | D cg c]  }t        |       c}S c c}w )Nr   r   )r   r   ExternKernelrequire_stride1realize_inputrealize_inputs)rf   r{   s     r0   r  r    sS    
4yA~..r/L/LTRSW/UVV'+,t!N1t,,,s   A')lowering)Vr  r  rO  r   r  r  ro   r  ior   typingr   r   r   r   r   r	   r
   unittest.mockr   rI   rs  torch._dynamo.testingr   torch._dynamo.utilsr   r   r   r   r   r   r  r   r   	codecacher   r   r   codegen.commonr   r   r   codegen.tritonr   r   r   r   codegen.triton_utilsr   r    excr!   utilsr"   r#   r$   r%   r&   virtualizedr'   	getLoggerr+   r  rZ   r(   rm   r>  r  r  r*   rE  r2   rE   rY  r   r   r@  r1  rW  r  r  r  r  r  r  r  r.   r/   r0   <module>r     sJ        
    C C C    . F F  @ > > H H P P > ! J J g! S#X 	 	
 !" *E< EP
 T J
^ J
Z*
 *
Z"
< "
J@
 @
Fl U
_ U
p ! 6- r/   